Initial Commit
This commit is contained in:
27
openssl-1.0.2f/crypto/bn/asm/README
Normal file
27
openssl-1.0.2f/crypto/bn/asm/README
Normal file
@@ -0,0 +1,27 @@
|
||||
<OBSOLETE>
|
||||
|
||||
All assember in this directory are just version of the file
|
||||
crypto/bn/bn_asm.c.
|
||||
|
||||
Quite a few of these files are just the assember output from gcc since on
|
||||
quite a few machines they are 2 times faster than the system compiler.
|
||||
|
||||
For the x86, I have hand written assember because of the bad job all
|
||||
compilers seem to do on it. This normally gives a 2 time speed up in the RSA
|
||||
routines.
|
||||
|
||||
For the DEC alpha, I also hand wrote the assember (except the division which
|
||||
is just the output from the C compiler pasted on the end of the file).
|
||||
On the 2 alpha C compilers I had access to, it was not possible to do
|
||||
64b x 64b -> 128b calculations (both long and the long long data types
|
||||
were 64 bits). So the hand assember gives access to the 128 bit result and
|
||||
a 2 times speedup :-).
|
||||
|
||||
There are 3 versions of assember for the HP PA-RISC.
|
||||
|
||||
pa-risc.s is the origional one which works fine and generated using gcc :-)
|
||||
|
||||
pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
|
||||
by Chris Ruemmler from HP (with some help from the HP C compiler).
|
||||
|
||||
</OBSOLETE>
|
||||
321
openssl-1.0.2f/crypto/bn/asm/alpha-mont.pl
Normal file
321
openssl-1.0.2f/crypto/bn/asm/alpha-mont.pl
Normal file
@@ -0,0 +1,321 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
|
||||
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
|
||||
# instructed to '-tune host' code with in-line assembler. Other
|
||||
# benchmarks improve by 15-20%. To anchor it to something else, the
|
||||
# code provides approximately the same performance per GHz as AMD64.
|
||||
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
|
||||
# difference.
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="a0"; # BN_ULONG *rp,
|
||||
$ap="a1"; # const BN_ULONG *ap,
|
||||
$bp="a2"; # const BN_ULONG *bp,
|
||||
$np="a3"; # const BN_ULONG *np,
|
||||
$n0="a4"; # const BN_ULONG *n0,
|
||||
$num="a5"; # int num);
|
||||
|
||||
$lo0="t0";
|
||||
$hi0="t1";
|
||||
$lo1="t2";
|
||||
$hi1="t3";
|
||||
$aj="t4";
|
||||
$bi="t5";
|
||||
$nj="t6";
|
||||
$tp="t7";
|
||||
$alo="t8";
|
||||
$ahi="t9";
|
||||
$nlo="t10";
|
||||
$nhi="t11";
|
||||
$tj="t12";
|
||||
$i="s3";
|
||||
$j="s4";
|
||||
$m1="s5";
|
||||
|
||||
$code=<<___;
|
||||
#ifdef __linux__
|
||||
#include <asm/regdef.h>
|
||||
#else
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
.globl bn_mul_mont
|
||||
.align 5
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
lda sp,-48(sp)
|
||||
stq ra,0(sp)
|
||||
stq s3,8(sp)
|
||||
stq s4,16(sp)
|
||||
stq s5,24(sp)
|
||||
stq fp,32(sp)
|
||||
mov sp,fp
|
||||
.mask 0x0400f000,-48
|
||||
.frame fp,48,ra
|
||||
.prologue 0
|
||||
|
||||
.align 4
|
||||
.set reorder
|
||||
sextl $num,$num
|
||||
mov 0,v0
|
||||
cmplt $num,4,AT
|
||||
bne AT,.Lexit
|
||||
|
||||
ldq $hi0,0($ap) # ap[0]
|
||||
s8addq $num,16,AT
|
||||
ldq $aj,8($ap)
|
||||
subq sp,AT,sp
|
||||
ldq $bi,0($bp) # bp[0]
|
||||
lda AT,-4096(zero) # mov -4096,AT
|
||||
ldq $n0,0($n0)
|
||||
and sp,AT,sp
|
||||
|
||||
mulq $hi0,$bi,$lo0
|
||||
ldq $hi1,0($np) # np[0]
|
||||
umulh $hi0,$bi,$hi0
|
||||
ldq $nj,8($np)
|
||||
|
||||
mulq $lo0,$n0,$m1
|
||||
|
||||
mulq $hi1,$m1,$lo1
|
||||
umulh $hi1,$m1,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,AT
|
||||
addq $hi1,AT,$hi1
|
||||
|
||||
mulq $aj,$bi,$alo
|
||||
mov 2,$j
|
||||
umulh $aj,$bi,$ahi
|
||||
mov sp,$tp
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
s8addq $j,$ap,$aj
|
||||
umulh $nj,$m1,$nhi
|
||||
s8addq $j,$np,$nj
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
ldq $aj,0($aj)
|
||||
addl $j,1,$j
|
||||
ldq $nj,0($nj)
|
||||
lda $tp,8($tp)
|
||||
|
||||
addq $alo,$hi0,$lo0
|
||||
mulq $aj,$bi,$alo
|
||||
cmpult $lo0,$hi0,AT
|
||||
addq $nlo,$hi1,$lo1
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
addq $ahi,AT,$hi0
|
||||
cmpult $lo1,$hi1,v0
|
||||
cmplt $j,$num,$tj
|
||||
|
||||
umulh $aj,$bi,$ahi
|
||||
addq $nhi,v0,$hi1
|
||||
addq $lo1,$lo0,$lo1
|
||||
s8addq $j,$ap,$aj
|
||||
|
||||
umulh $nj,$m1,$nhi
|
||||
cmpult $lo1,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
s8addq $j,$np,$nj
|
||||
|
||||
stq $lo1,-8($tp)
|
||||
nop
|
||||
unop
|
||||
bne $tj,.L1st
|
||||
.set reorder
|
||||
|
||||
addq $alo,$hi0,$lo0
|
||||
addq $nlo,$hi1,$lo1
|
||||
cmpult $lo0,$hi0,AT
|
||||
cmpult $lo1,$hi1,v0
|
||||
addq $ahi,AT,$hi0
|
||||
addq $nhi,v0,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
|
||||
stq $lo1,0($tp)
|
||||
|
||||
addq $hi1,$hi0,$hi1
|
||||
cmpult $hi1,$hi0,AT
|
||||
stq $hi1,8($tp)
|
||||
stq AT,16($tp)
|
||||
|
||||
mov 1,$i
|
||||
.align 4
|
||||
.Louter:
|
||||
s8addq $i,$bp,$bi
|
||||
ldq $hi0,0($ap)
|
||||
ldq $aj,8($ap)
|
||||
ldq $bi,0($bi)
|
||||
ldq $hi1,0($np)
|
||||
ldq $nj,8($np)
|
||||
ldq $tj,0(sp)
|
||||
|
||||
mulq $hi0,$bi,$lo0
|
||||
umulh $hi0,$bi,$hi0
|
||||
|
||||
addq $lo0,$tj,$lo0
|
||||
cmpult $lo0,$tj,AT
|
||||
addq $hi0,AT,$hi0
|
||||
|
||||
mulq $lo0,$n0,$m1
|
||||
|
||||
mulq $hi1,$m1,$lo1
|
||||
umulh $hi1,$m1,$hi1
|
||||
|
||||
addq $lo1,$lo0,$lo1
|
||||
cmpult $lo1,$lo0,AT
|
||||
mov 2,$j
|
||||
addq $hi1,AT,$hi1
|
||||
|
||||
mulq $aj,$bi,$alo
|
||||
mov sp,$tp
|
||||
umulh $aj,$bi,$ahi
|
||||
|
||||
mulq $nj,$m1,$nlo
|
||||
s8addq $j,$ap,$aj
|
||||
umulh $nj,$m1,$nhi
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
ldq $tj,8($tp) #L0
|
||||
nop #U1
|
||||
ldq $aj,0($aj) #L1
|
||||
s8addq $j,$np,$nj #U0
|
||||
|
||||
ldq $nj,0($nj) #L0
|
||||
nop #U1
|
||||
addq $alo,$hi0,$lo0 #L1
|
||||
lda $tp,8($tp)
|
||||
|
||||
mulq $aj,$bi,$alo #U1
|
||||
cmpult $lo0,$hi0,AT #L0
|
||||
addq $nlo,$hi1,$lo1 #L1
|
||||
addl $j,1,$j
|
||||
|
||||
mulq $nj,$m1,$nlo #U1
|
||||
addq $ahi,AT,$hi0 #L0
|
||||
addq $lo0,$tj,$lo0 #L1
|
||||
cmpult $lo1,$hi1,v0 #U0
|
||||
|
||||
umulh $aj,$bi,$ahi #U1
|
||||
cmpult $lo0,$tj,AT #L0
|
||||
addq $lo1,$lo0,$lo1 #L1
|
||||
addq $nhi,v0,$hi1 #U0
|
||||
|
||||
umulh $nj,$m1,$nhi #U1
|
||||
s8addq $j,$ap,$aj #L0
|
||||
cmpult $lo1,$lo0,v0 #L1
|
||||
cmplt $j,$num,$tj #U0 # borrow $tj
|
||||
|
||||
addq $hi0,AT,$hi0 #L0
|
||||
addq $hi1,v0,$hi1 #U1
|
||||
stq $lo1,-8($tp) #L1
|
||||
bne $tj,.Linner #U0
|
||||
.set reorder
|
||||
|
||||
ldq $tj,8($tp)
|
||||
addq $alo,$hi0,$lo0
|
||||
addq $nlo,$hi1,$lo1
|
||||
cmpult $lo0,$hi0,AT
|
||||
cmpult $lo1,$hi1,v0
|
||||
addq $ahi,AT,$hi0
|
||||
addq $nhi,v0,$hi1
|
||||
|
||||
addq $lo0,$tj,$lo0
|
||||
cmpult $lo0,$tj,AT
|
||||
addq $hi0,AT,$hi0
|
||||
|
||||
ldq $tj,16($tp)
|
||||
addq $lo1,$lo0,$j
|
||||
cmpult $j,$lo0,v0
|
||||
addq $hi1,v0,$hi1
|
||||
|
||||
addq $hi1,$hi0,$lo1
|
||||
stq $j,0($tp)
|
||||
cmpult $lo1,$hi0,$hi1
|
||||
addq $lo1,$tj,$lo1
|
||||
cmpult $lo1,$tj,AT
|
||||
addl $i,1,$i
|
||||
addq $hi1,AT,$hi1
|
||||
stq $lo1,8($tp)
|
||||
cmplt $i,$num,$tj # borrow $tj
|
||||
stq $hi1,16($tp)
|
||||
bne $tj,.Louter
|
||||
|
||||
s8addq $num,sp,$tj # &tp[num]
|
||||
mov $rp,$bp # put rp aside
|
||||
mov sp,$tp
|
||||
mov sp,$ap
|
||||
mov 0,$hi0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: ldq $lo0,0($tp)
|
||||
ldq $lo1,0($np)
|
||||
lda $tp,8($tp)
|
||||
lda $np,8($np)
|
||||
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
|
||||
cmpult $lo0,$lo1,AT
|
||||
subq $lo1,$hi0,$lo0
|
||||
cmpult $lo1,$lo0,$hi0
|
||||
or $hi0,AT,$hi0
|
||||
stq $lo0,0($rp)
|
||||
cmpult $tp,$tj,v0
|
||||
lda $rp,8($rp)
|
||||
bne v0,.Lsub
|
||||
|
||||
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
|
||||
mov sp,$tp
|
||||
mov $bp,$rp # restore rp
|
||||
|
||||
and sp,$hi0,$ap
|
||||
bic $bp,$hi0,$bp
|
||||
bis $bp,$ap,$ap # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
|
||||
lda $tp,8($tp)
|
||||
lda $rp,8($rp)
|
||||
lda $ap,8($ap)
|
||||
stq zero,-8($tp) # zap tp
|
||||
cmpult $tp,$tj,AT
|
||||
stq $aj,-8($rp)
|
||||
bne AT,.Lcopy
|
||||
mov 1,v0
|
||||
|
||||
.Lexit:
|
||||
.set noreorder
|
||||
mov fp,sp
|
||||
/*ldq ra,0(sp)*/
|
||||
ldq s3,8(sp)
|
||||
ldq s4,16(sp)
|
||||
ldq s5,24(sp)
|
||||
ldq fp,32(sp)
|
||||
lda sp,48(sp)
|
||||
ret (ra)
|
||||
.end bn_mul_mont
|
||||
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
||||
289
openssl-1.0.2f/crypto/bn/asm/armv4-gf2m.pl
Normal file
289
openssl-1.0.2f/crypto/bn/asm/armv4-gf2m.pl
Normal file
@@ -0,0 +1,289 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
|
||||
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
|
||||
# C for the time being... Except that it has two code paths: pure
|
||||
# integer code suitable for any ARMv4 and later CPU and NEON code
|
||||
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
|
||||
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
|
||||
# faster than compiler-generated code. For ECDH and ECDSA verify (but
|
||||
# not for ECDSA sign) it means 25%-45% improvement depending on key
|
||||
# length, more for longer keys. Even though NEON 1x1 multiplication
|
||||
# runs in even less cycles, ~30, improvement is measurable only on
|
||||
# longer keys. One has to optimize code elsewhere to get NEON glow...
|
||||
#
|
||||
# April 2014
|
||||
#
|
||||
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
|
||||
# referred below, which improves ECDH and ECDSA verify benchmarks
|
||||
# by 18-40%.
|
||||
#
|
||||
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
.code 32
|
||||
___
|
||||
################
|
||||
# private interface to mul_1x1_ialu
|
||||
#
|
||||
$a="r1";
|
||||
$b="r0";
|
||||
|
||||
($a0,$a1,$a2,$a12,$a4,$a14)=
|
||||
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
|
||||
|
||||
$mask="r12";
|
||||
|
||||
$code.=<<___;
|
||||
.type mul_1x1_ialu,%function
|
||||
.align 5
|
||||
mul_1x1_ialu:
|
||||
mov $a0,#0
|
||||
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
|
||||
str $a0,[sp,#0] @ tab[0]=0
|
||||
add $a2,$a1,$a1 @ a2=a1<<1
|
||||
str $a1,[sp,#4] @ tab[1]=a1
|
||||
eor $a12,$a1,$a2 @ a1^a2
|
||||
str $a2,[sp,#8] @ tab[2]=a2
|
||||
mov $a4,$a1,lsl#2 @ a4=a1<<2
|
||||
str $a12,[sp,#12] @ tab[3]=a1^a2
|
||||
eor $a14,$a1,$a4 @ a1^a4
|
||||
str $a4,[sp,#16] @ tab[4]=a4
|
||||
eor $a0,$a2,$a4 @ a2^a4
|
||||
str $a14,[sp,#20] @ tab[5]=a1^a4
|
||||
eor $a12,$a12,$a4 @ a1^a2^a4
|
||||
str $a0,[sp,#24] @ tab[6]=a2^a4
|
||||
and $i0,$mask,$b,lsl#2
|
||||
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
|
||||
|
||||
and $i1,$mask,$b,lsr#1
|
||||
ldr $lo,[sp,$i0] @ tab[b & 0x7]
|
||||
and $i0,$mask,$b,lsr#4
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
|
||||
and $i1,$mask,$b,lsr#7
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
|
||||
eor $lo,$lo,$t1,lsl#3 @ stall
|
||||
mov $hi,$t1,lsr#29
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#10
|
||||
eor $lo,$lo,$t0,lsl#6
|
||||
eor $hi,$hi,$t0,lsr#26
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#13
|
||||
eor $lo,$lo,$t1,lsl#9
|
||||
eor $hi,$hi,$t1,lsr#23
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#16
|
||||
eor $lo,$lo,$t0,lsl#12
|
||||
eor $hi,$hi,$t0,lsr#20
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#19
|
||||
eor $lo,$lo,$t1,lsl#15
|
||||
eor $hi,$hi,$t1,lsr#17
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
|
||||
|
||||
and $i0,$mask,$b,lsr#22
|
||||
eor $lo,$lo,$t0,lsl#18
|
||||
eor $hi,$hi,$t0,lsr#14
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
|
||||
|
||||
and $i1,$mask,$b,lsr#25
|
||||
eor $lo,$lo,$t1,lsl#21
|
||||
eor $hi,$hi,$t1,lsr#11
|
||||
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
|
||||
|
||||
tst $a,#1<<30
|
||||
and $i0,$mask,$b,lsr#28
|
||||
eor $lo,$lo,$t0,lsl#24
|
||||
eor $hi,$hi,$t0,lsr#8
|
||||
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
|
||||
|
||||
eorne $lo,$lo,$b,lsl#30
|
||||
eorne $hi,$hi,$b,lsr#2
|
||||
tst $a,#1<<31
|
||||
eor $lo,$lo,$t1,lsl#27
|
||||
eor $hi,$hi,$t1,lsr#5
|
||||
eorne $lo,$lo,$b,lsl#31
|
||||
eorne $hi,$hi,$b,lsr#1
|
||||
eor $lo,$lo,$t0,lsl#30
|
||||
eor $hi,$hi,$t0,lsr#2
|
||||
|
||||
mov pc,lr
|
||||
.size mul_1x1_ialu,.-mul_1x1_ialu
|
||||
___
|
||||
################
|
||||
# void bn_GF2m_mul_2x2(BN_ULONG *r,
|
||||
# BN_ULONG a1,BN_ULONG a0,
|
||||
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
|
||||
{
|
||||
$code.=<<___;
|
||||
.global bn_GF2m_mul_2x2
|
||||
.type bn_GF2m_mul_2x2,%function
|
||||
.align 5
|
||||
bn_GF2m_mul_2x2:
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
ldr r12,.LOPENSSL_armcap
|
||||
.Lpic: ldr r12,[pc,r12]
|
||||
tst r12,#1
|
||||
bne .LNEON
|
||||
#endif
|
||||
___
|
||||
$ret="r10"; # reassigned 1st argument
|
||||
$code.=<<___;
|
||||
stmdb sp!,{r4-r10,lr}
|
||||
mov $ret,r0 @ reassign 1st argument
|
||||
mov $b,r3 @ $b=b1
|
||||
ldr r3,[sp,#32] @ load b0
|
||||
mov $mask,#7<<2
|
||||
sub sp,sp,#32 @ allocate tab[8]
|
||||
|
||||
bl mul_1x1_ialu @ a1·b1
|
||||
str $lo,[$ret,#8]
|
||||
str $hi,[$ret,#12]
|
||||
|
||||
eor $b,$b,r3 @ flip b0 and b1
|
||||
eor $a,$a,r2 @ flip a0 and a1
|
||||
eor r3,r3,$b
|
||||
eor r2,r2,$a
|
||||
eor $b,$b,r3
|
||||
eor $a,$a,r2
|
||||
bl mul_1x1_ialu @ a0·b0
|
||||
str $lo,[$ret]
|
||||
str $hi,[$ret,#4]
|
||||
|
||||
eor $a,$a,r2
|
||||
eor $b,$b,r3
|
||||
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
|
||||
___
|
||||
@r=map("r$_",(6..9));
|
||||
$code.=<<___;
|
||||
ldmia $ret,{@r[0]-@r[3]}
|
||||
eor $lo,$lo,$hi
|
||||
eor $hi,$hi,@r[1]
|
||||
eor $lo,$lo,@r[0]
|
||||
eor $hi,$hi,@r[2]
|
||||
eor $lo,$lo,@r[3]
|
||||
eor $hi,$hi,@r[3]
|
||||
str $hi,[$ret,#8]
|
||||
eor $lo,$lo,$hi
|
||||
add sp,sp,#32 @ destroy tab[8]
|
||||
str $lo,[$ret,#4]
|
||||
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r10,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r10,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
|
||||
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.align 5
|
||||
.LNEON:
|
||||
ldr r12, [sp] @ 5th argument
|
||||
vmov.32 $a, r2, r1
|
||||
vmov.32 $b, r12, r3
|
||||
vmov.i64 $k48, #0x0000ffffffffffff
|
||||
vmov.i64 $k32, #0x00000000ffffffff
|
||||
vmov.i64 $k16, #0x000000000000ffff
|
||||
|
||||
vext.8 $t0#lo, $a, $a, #1 @ A1
|
||||
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||||
vext.8 $r#lo, $b, $b, #1 @ B1
|
||||
vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||||
vext.8 $t1#lo, $a, $a, #2 @ A2
|
||||
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||||
vext.8 $t3#lo, $b, $b, #2 @ B2
|
||||
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||||
vext.8 $t2#lo, $a, $a, #3 @ A3
|
||||
veor $t0, $t0, $r @ L = E + F
|
||||
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||||
vext.8 $r#lo, $b, $b, #3 @ B3
|
||||
veor $t1, $t1, $t3 @ M = G + H
|
||||
vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||||
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||||
vand $t0#hi, $t0#hi, $k48
|
||||
vext.8 $t3#lo, $b, $b, #4 @ B4
|
||||
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||||
vand $t1#hi, $t1#hi, $k32
|
||||
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||||
veor $t2, $t2, $r @ N = I + J
|
||||
veor $t0#lo, $t0#lo, $t0#hi
|
||||
veor $t1#lo, $t1#lo, $t1#hi
|
||||
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||||
vand $t2#hi, $t2#hi, $k16
|
||||
vext.8 $t0, $t0, $t0, #15
|
||||
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 $t3#hi, #0
|
||||
vext.8 $t1, $t1, $t1, #14
|
||||
veor $t2#lo, $t2#lo, $t2#hi
|
||||
vmull.p8 $r, $a, $b @ D = A*B
|
||||
vext.8 $t3, $t3, $t3, #12
|
||||
vext.8 $t2, $t2, $t2, #13
|
||||
veor $t0, $t0, $t1
|
||||
veor $t2, $t2, $t3
|
||||
veor $r, $r, $t0
|
||||
veor $r, $r, $t2
|
||||
|
||||
vst1.32 {$r}, [r0]
|
||||
ret @ bx lr
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.align 5
|
||||
.LOPENSSL_armcap:
|
||||
.word OPENSSL_armcap_P-(.Lpic+8)
|
||||
#endif
|
||||
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 5
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
#endif
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT; # enforce flush
|
||||
676
openssl-1.0.2f/crypto/bn/asm/armv4-mont.pl
Normal file
676
openssl-1.0.2f/crypto/bn/asm/armv4-mont.pl
Normal file
@@ -0,0 +1,676 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# January 2007.
|
||||
|
||||
# Montgomery multiplication for ARMv4.
|
||||
#
|
||||
# Performance improvement naturally varies among CPU implementations
|
||||
# and compilers. The code was observed to provide +65-35% improvement
|
||||
# [depending on key length, less for longer keys] on ARM920T, and
|
||||
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
|
||||
# base and compiler generated code with in-lined umull and even umlal
|
||||
# instructions. The latter means that this code didn't really have an
|
||||
# "advantage" of utilizing some "secret" instruction.
|
||||
#
|
||||
# The code is interoperable with Thumb ISA and is rather compact, less
|
||||
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
|
||||
# about decorations, ABI and instruction syntax are identical.
|
||||
|
||||
# November 2013
|
||||
#
|
||||
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
|
||||
# performance improvement on Cortex-A8 is ~45-100% depending on key
|
||||
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
|
||||
# On Snapdragon S4 improvement was measured to vary from ~70% to
|
||||
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
|
||||
# rather because original integer-only code seems to perform
|
||||
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
|
||||
# different. It's being looked into, but the trouble is that
|
||||
# performance for vectors longer than 256 bits is actually couple
|
||||
# of percent worse than for integer-only code. The code is chosen
|
||||
# for execution on all NEON-capable processors, because gain on
|
||||
# others outweighs the marginal loss on Cortex-A9.
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$num="r0"; # starts as num argument, but holds &tp[num-1]
|
||||
$ap="r1";
|
||||
$bp="r2"; $bi="r2"; $rp="r2";
|
||||
$np="r3";
|
||||
$tp="r4";
|
||||
$aj="r5";
|
||||
$nj="r6";
|
||||
$tj="r7";
|
||||
$n0="r8";
|
||||
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
|
||||
$alo="r10"; # sl, gcc uses it to keep @GOT
|
||||
$ahi="r11"; # fp
|
||||
$nlo="r12"; # ip
|
||||
########### # r13 is stack pointer
|
||||
$nhi="r14"; # lr
|
||||
########### # r15 is program counter
|
||||
|
||||
#### argument block layout relative to &tp[num-1], a.k.a. $num
|
||||
$_rp="$num,#12*4";
|
||||
# ap permanently resides in r1
|
||||
$_bp="$num,#13*4";
|
||||
# np permanently resides in r3
|
||||
$_n0="$num,#14*4";
|
||||
$_num="$num,#15*4"; $_bpend=$_num;
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
.code 32
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.align 5
|
||||
.LOPENSSL_armcap:
|
||||
.word OPENSSL_armcap_P-bn_mul_mont
|
||||
#endif
|
||||
|
||||
.global bn_mul_mont
|
||||
.type bn_mul_mont,%function
|
||||
|
||||
.align 5
|
||||
bn_mul_mont:
|
||||
ldr ip,[sp,#4] @ load num
|
||||
stmdb sp!,{r0,r2} @ sp points at argument block
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
tst ip,#7
|
||||
bne .Lialu
|
||||
adr r0,bn_mul_mont
|
||||
ldr r2,.LOPENSSL_armcap
|
||||
ldr r0,[r0,r2]
|
||||
tst r0,#1 @ NEON available?
|
||||
ldmia sp, {r0,r2}
|
||||
beq .Lialu
|
||||
add sp,sp,#8
|
||||
b bn_mul8x_mont_neon
|
||||
.align 4
|
||||
.Lialu:
|
||||
#endif
|
||||
cmp ip,#2
|
||||
mov $num,ip @ load num
|
||||
movlt r0,#0
|
||||
addlt sp,sp,#2*4
|
||||
blt .Labrt
|
||||
|
||||
stmdb sp!,{r4-r12,lr} @ save 10 registers
|
||||
|
||||
mov $num,$num,lsl#2 @ rescale $num for byte count
|
||||
sub sp,sp,$num @ alloca(4*num)
|
||||
sub sp,sp,#4 @ +extra dword
|
||||
sub $num,$num,#4 @ "num=num-1"
|
||||
add $tp,$bp,$num @ &bp[num-1]
|
||||
|
||||
add $num,sp,$num @ $num to point at &tp[num-1]
|
||||
ldr $n0,[$_n0] @ &n0
|
||||
ldr $bi,[$bp] @ bp[0]
|
||||
ldr $aj,[$ap],#4 @ ap[0],ap++
|
||||
ldr $nj,[$np],#4 @ np[0],np++
|
||||
ldr $n0,[$n0] @ *n0
|
||||
str $tp,[$_bpend] @ save &bp[num]
|
||||
|
||||
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
|
||||
str $n0,[$_n0] @ save n0 value
|
||||
mul $n0,$alo,$n0 @ "tp[0]"*n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.L1st:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
mov $alo,$ahi
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .L1st
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
mov $nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
.Louter:
|
||||
sub $tj,$num,sp @ "original" $num-1 value
|
||||
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
|
||||
ldr $bi,[$tp,#4]! @ *(++bp)
|
||||
sub $np,$np,$tj @ "rewind" np to &np[1]
|
||||
ldr $aj,[$ap,#-4] @ ap[0]
|
||||
ldr $alo,[sp] @ tp[0]
|
||||
ldr $nj,[$np,#-4] @ np[0]
|
||||
ldr $tj,[sp,#4] @ tp[1]
|
||||
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
|
||||
str $tp,[$_bp] @ save bp
|
||||
mul $n0,$alo,$n0
|
||||
mov $nlo,#0
|
||||
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
|
||||
mov $tp,sp
|
||||
|
||||
.Linner:
|
||||
ldr $aj,[$ap],#4 @ ap[j],ap++
|
||||
adds $alo,$ahi,$tj @ +=tp[j]
|
||||
ldr $nj,[$np],#4 @ np[j],np++
|
||||
mov $ahi,#0
|
||||
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
|
||||
mov $nhi,#0
|
||||
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
|
||||
adc $ahi,$ahi,#0
|
||||
ldr $tj,[$tp,#8] @ tp[j+1]
|
||||
adds $nlo,$nlo,$alo
|
||||
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
|
||||
adc $nlo,$nhi,#0
|
||||
cmp $tp,$num
|
||||
bne .Linner
|
||||
|
||||
adds $nlo,$nlo,$ahi
|
||||
mov $nhi,#0
|
||||
ldr $tp,[$_bp] @ restore bp
|
||||
adc $nhi,$nhi,#0
|
||||
ldr $n0,[$_n0] @ restore n0
|
||||
adds $nlo,$nlo,$tj
|
||||
ldr $tj,[$_bpend] @ restore &bp[num]
|
||||
adc $nhi,$nhi,#0
|
||||
str $nlo,[$num] @ tp[num-1]=
|
||||
str $nhi,[$num,#4] @ tp[num]=
|
||||
|
||||
cmp $tp,$tj
|
||||
bne .Louter
|
||||
|
||||
ldr $rp,[$_rp] @ pull rp
|
||||
add $num,$num,#4 @ $num to point at &tp[num]
|
||||
sub $aj,$num,sp @ "original" num value
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
mov $ap,$tp @ "borrow" $ap
|
||||
sub $np,$np,$aj @ "rewind" $np to &np[0]
|
||||
|
||||
subs $tj,$tj,$tj @ "clear" carry flag
|
||||
.Lsub: ldr $tj,[$tp],#4
|
||||
ldr $nj,[$np],#4
|
||||
sbcs $tj,$tj,$nj @ tp[j]-np[j]
|
||||
str $tj,[$rp],#4 @ rp[j]=
|
||||
teq $tp,$num @ preserve carry
|
||||
bne .Lsub
|
||||
sbcs $nhi,$nhi,#0 @ upmost carry
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
sub $rp,$rp,$aj @ "rewind" $rp
|
||||
|
||||
and $ap,$tp,$nhi
|
||||
bic $np,$rp,$nhi
|
||||
orr $ap,$ap,$np @ ap=borrow?tp:rp
|
||||
|
||||
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
str $tj,[$rp],#4
|
||||
cmp $tp,$num
|
||||
bne .Lcopy
|
||||
|
||||
add sp,$num,#4 @ skip over tp[num+1]
|
||||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labrt:
|
||||
#if __ARM_ARCH__>=5
|
||||
ret @ bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
___
|
||||
{
# Helpers mapping a NEON quad register name ("qN") to its low/high
# double-register halves ("d2N" / "d2N+1"); empty string if no match.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# NEON register allocation for bn_mul8x_mont_neon:
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));		# 8 words of a[] (2 per d-reg)
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));		# 8 words of n[] (modulus)
my ($Z,$Temp)=("q4","q5");			# zero / scratch for carry work
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));	# 64-bit column accumulators
my ($Bi,$Ni,$M0)=map("d$_",(28..31));		# current b-word, n0*t word, n0
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

# Core (ARM) register allocation: first four args in r0-r3 per AAPCS,
# n0 and num loaded from the caller's stack via ip.
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

# NEON Montgomery multiplication, 8 words per iteration.  Products are
# accumulated in 16-bit-limb form (note the vzip.16 / #16 shifts used to
# propagate carries), with a scratch frame alloca-ted below sp.
# NOTE(review): the loop structure processes a[]/n[] in groups of 8, so
# num is presumably required to be a multiple of 8 — confirm with caller.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
|
||||
# Trailer: version string and (when runtime NEON detection is compiled in)
# the capability word the dispatcher tests.
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-process the accumulated assembly text before emitting it:
# 1. expand `...` spans by eval-ing them as Perl (this is how the
#    &Dlo()/&Dhi() register-name helpers above get substituted);
# 2. rewrite "bx lr" as a raw opcode word so the file still assembles
#    with -march=armv4 toolchains that reject Thumb-interworking mnemonics;
# 3. rewrite the "ret" pseudo-instruction as "bx lr".
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT;
|
||||
774
openssl-1.0.2f/crypto/bn/asm/bn-586.pl
Normal file
774
openssl-1.0.2f/crypto/bn/asm/bn-586.pl
Normal file
@@ -0,0 +1,774 @@
|
||||
#!/usr/local/bin/perl
#
# bn-586.pl: perlasm generator for the x86 BIGNUM primitive word
# operations (crypto/bn/bn_asm.c equivalents).  Emits assembly in the
# flavour selected by $ARGV[0] via the x86asm.pl framework.

# Locate this script's directory so the shared perlasm modules can be found.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $sse2 enables the runtime-dispatched SSE2 (pmuludq-based) fast paths;
# it is switched on by -DOPENSSL_IA32_SSE2 on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The SSE2 paths test bit 26 of OPENSSL_ia32cap_P at run time.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Emit one assembly function per BIGNUM primitive.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();
|
||||
|
||||
sub bn_mul_add_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_mul_add_words(BN_ULONG *r, const BN_ULONG *a,
	#                                  int num, BN_ULONG w)
	# r[i] += a[i]*w for i in [0,num); returns the final carry word.
	# When $sse2 is set, an SSE2 path (runtime-selected on CPUID bit 26)
	# is emitted first: it keeps the 64-bit running carry in mm1 and uses
	# pmuludq for the 32x32->64 multiplies, with an 8-word unrolled body.
	# The generic path follows, using mul/adc with carry in esi.

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		# Fall through to the integer code if SSE2 is not available.
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1")	;
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		# Take the unrolled path while at least 8 words remain.
		&test($c,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		# One-word-at-a-time cleanup loop for num%8 words.
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue (done by hand because the SSE2 path above
	# returned before the usual register saves were needed)
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num / 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	# Main loop: 8 words per iteration, fully unrolled.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	# Handle the remaining num%8 words.
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);			# return carry in eax

	&pop("ecx");	# clear variable from

	&function_end($name);
	}
|
||||
|
||||
sub bn_mul_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_mul_words(BN_ULONG *r, const BN_ULONG *a,
	#                              int num, BN_ULONG w)
	# r[i] = a[i]*w (plus propagated carry); returns the final carry.
	# SSE2 path (runtime-selected) keeps the carry in mm1; generic path
	# uses mul/adc with carry in esi, 8 words per unrolled iteration.

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	# Main loop: 8 words per iteration, fully unrolled.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX

		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c= H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	# Handle the remaining num%8 words.
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		&dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);			# return carry in eax

	&function_end($name);
	}
|
||||
|
||||
sub bn_sqr_words
	{
	local($name)=@_;

	# Emits: void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int num)
	# r[2i],r[2i+1] = a[i]*a[i] (low,high) — independent word squares,
	# no cross-word carry.  SSE2 path stores each 64-bit square with a
	# single movq; generic path uses mul eax.

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	# Main loop: 8 squares per iteration, fully unrolled.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		# XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	#
		&mov(&DWP($i*2+4,$r,"",0),"edx");#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	# Handle the remaining num%8 words.
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		# XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	#
		&dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		&jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
|
||||
|
||||
sub bn_div_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
	# Divides the 64-bit value h:l by d with a single "div": edx=h,
	# eax=l, quotient returned in eax.  NOTE(review): the raw div will
	# fault on d==0 or quotient overflow (h>=d) — callers presumably
	# guarantee h<d; confirm at call sites.

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	# h (high word of dividend)
	&mov("eax",&wparam(1));	# l (low word of dividend)
	&mov("ecx",&wparam(2));	# d (divisor)
	&div("ecx");		# eax = edx:eax / ecx
	&ret();
	&function_end_B($name);
	}
|
||||
|
||||
sub bn_add_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a,
	#                              const BN_ULONG *b, int num)
	# r[i] = a[i] + b[i] with carry propagation; returns the final
	# carry (0 or 1) in eax.  The per-word carry is re-materialized in
	# $c via the mov 0 / adc / adc 0 sequence, 8 words per iteration.

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	# Main loop: 8 words per iteration, fully unrolled.
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);				# add carry-in
		&mov($c,0);
		&adc($c,$c);				# capture carry
		&add($tmp1,$tmp2);
		&adc($c,0);				# accumulate carry
		&mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	# Handle the remaining num%8 words.
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
|
||||
|
||||
sub bn_sub_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a,
	#                              const BN_ULONG *b, int num)
	# r[i] = a[i] - b[i] with borrow propagation; returns the final
	# borrow (0 or 1) in eax.  Structure mirrors bn_add_words with
	# sub in place of add.

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	# Main loop: 8 words per iteration, fully unrolled.
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);				# subtract borrow-in
		&mov($c,0);
		&adc($c,$c);				# capture borrow
		&sub($tmp1,$tmp2);
		&adc($c,0);				# accumulate borrow
		&mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	# Handle the remaining num%8 words.
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
|
||||
|
||||
sub bn_sub_part_words
	{
	local($name)=@_;

	# Emits: BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
	#                            const BN_ULONG *b, int num, int dl)
	# First does r[i]=a[i]-b[i] for num words (same code shape as
	# bn_sub_words), then handles the length difference dl:
	#   dl < 0: b is longer — continue with r[i] = 0 - b[i] - borrow;
	#   dl > 0: a is longer — copy a[] into r[] while propagating the
	#           borrow; once the borrow dies out the pw_nc* paths do a
	#           plain copy.  Returns the final borrow in eax.

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	# Common-length part: 8 words per iteration, fully unrolled.
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Tail of the common-length part; pointers advance per word here
	# because the dl-handling code below continues from where they stop.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		&mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		&dec($num) if ($i != 6);
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	# dl < 0: subtract the remaining |dl| words of b from zero.
	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);	# num = -dl
	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("dl<0 Round $i");

		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);	# num = -dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl<0 Tail Round $i");
		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("pw_end")) if ($i != 6);
		}

	&jmp(&label("pw_end"));

	# dl > 0: propagate the borrow through the remaining words of a;
	# fall out to the no-carry copy loops as soon as it clears (jnc).
	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
		{
		&comment("dl>0 Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_nc".$i));	# borrow cleared: plain copy
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl>0 Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_tail_nc".$i));
		&dec($num) if ($i != 6);
		&jz(&label("pw_end")) if ($i != 6);
		}
	&mov($c,1);		# borrow survived the whole tail
	&jmp(&label("pw_end"));

	# No-carry continuation: just copy a[] into r[].  The pw_nc$i /
	# pw_tail_nc$i labels let the loops above jump in mid-iteration.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_nc".$i,0);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_tail_nc".$i,0);
		&dec($num) if ($i != 6);
		&jz(&label("pw_nc_end")) if ($i != 6);
		}

	&set_label("pw_nc_end",0);
	&mov($c,0);		# no borrow out on this path

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
|
||||
|
||||
287
openssl-1.0.2f/crypto/bn/asm/co-586.pl
Normal file
287
openssl-1.0.2f/crypto/bn/asm/co-586.pl
Normal file
@@ -0,0 +1,287 @@
|
||||
#!/usr/local/bin/perl
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
&bn_mul_comba("bn_mul_comba8",8);
|
||||
&bn_mul_comba("bn_mul_comba4",4);
|
||||
&bn_sqr_comba("bn_sqr_comba8",8);
|
||||
&bn_sqr_comba("bn_sqr_comba4",4);
|
||||
|
||||
&asm_finish();
|
||||
|
||||
sub mul_add_c
|
||||
{
|
||||
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("mul a[$ai]*b[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$b,"",0));
|
||||
|
||||
&mul("edx");
|
||||
&add($c0,"eax");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
|
||||
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
|
||||
###
|
||||
&adc($c1,"edx");
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
|
||||
}
|
||||
|
||||
sub sqr_add_c
|
||||
{
|
||||
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("sqr a[$ai]*a[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$b,"",0));
|
||||
|
||||
if ($ai == $bi)
|
||||
{ &mul("eax");}
|
||||
else
|
||||
{ &mul("edx");}
|
||||
&add($c0,"eax");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
|
||||
###
|
||||
&adc($c1,"edx");
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
}
|
||||
|
||||
sub sqr_add_c2
|
||||
{
|
||||
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("sqr a[$ai]*a[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$a,"",0));
|
||||
|
||||
if ($ai == $bi)
|
||||
{ &mul("eax");}
|
||||
else
|
||||
{ &mul("edx");}
|
||||
&add("eax","eax");
|
||||
###
|
||||
&adc("edx","edx");
|
||||
###
|
||||
&adc($c2,0);
|
||||
&add($c0,"eax");
|
||||
&adc($c1,"edx");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
&adc($c2,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
|
||||
###
|
||||
}
|
||||
|
||||
sub bn_mul_comba
|
||||
{
|
||||
local($name,$num)=@_;
|
||||
local($a,$b,$c0,$c1,$c2);
|
||||
local($i,$as,$ae,$bs,$be,$ai,$bi);
|
||||
local($tot,$end);
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
$c0="ebx";
|
||||
$c1="ecx";
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
$be=0;
|
||||
$tot=$num+$num-1;
|
||||
|
||||
&push("esi");
|
||||
&mov($a,&wparam(1));
|
||||
&push("edi");
|
||||
&mov($b,&wparam(2));
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
|
||||
&xor($c0,$c0);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&xor($c1,$c1);
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
$ai=$as;
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("################## Calculate word $i");
|
||||
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
&xor($c2,$c2) if ($j == $bs);
|
||||
if (($j+1) == $end)
|
||||
{
|
||||
$v=1;
|
||||
$v=2 if (($i+1) == $tot);
|
||||
}
|
||||
else
|
||||
{ $v=0; }
|
||||
if (($j+1) != $end)
|
||||
{
|
||||
$na=($ai-1);
|
||||
$nb=($bi+1);
|
||||
}
|
||||
else
|
||||
{
|
||||
$na=$as+($i < ($num-1));
|
||||
$nb=$bs+($i >= ($num-1));
|
||||
}
|
||||
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
|
||||
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
if ($v)
|
||||
{
|
||||
&comment("saved r[$i]");
|
||||
# &mov("eax",&wparam(0));
|
||||
# &mov(&DWP($i*4,"eax","",0),$c0);
|
||||
($c0,$c1,$c2)=($c1,$c2,$c0);
|
||||
}
|
||||
$ai--;
|
||||
$bi++;
|
||||
}
|
||||
$as++ if ($i < ($num-1));
|
||||
$ae++ if ($i >= ($num-1));
|
||||
|
||||
$bs++ if ($i >= ($num-1));
|
||||
$be++ if ($i < ($num-1));
|
||||
}
|
||||
&comment("save r[$i]");
|
||||
# &mov("eax",&wparam(0));
|
||||
&mov(&DWP($i*4,"eax","",0),$c0);
|
||||
|
||||
&pop("ebx");
|
||||
&pop("ebp");
|
||||
&pop("edi");
|
||||
&pop("esi");
|
||||
&ret();
|
||||
&function_end_B($name);
|
||||
}
|
||||
|
||||
sub bn_sqr_comba
|
||||
{
|
||||
local($name,$num)=@_;
|
||||
local($r,$a,$c0,$c1,$c2)=@_;
|
||||
local($i,$as,$ae,$bs,$be,$ai,$bi);
|
||||
local($b,$tot,$end,$half);
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
$c0="ebx";
|
||||
$c1="ecx";
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$r="edi";
|
||||
|
||||
&push("esi");
|
||||
&push("edi");
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
&mov($r,&wparam(0));
|
||||
&mov($a,&wparam(1));
|
||||
&xor($c0,$c0);
|
||||
&xor($c1,$c1);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
$be=0;
|
||||
$tot=$num+$num-1;
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
$ai=$as;
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("############### Calculate word $i");
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
&xor($c2,$c2) if ($j == $bs);
|
||||
if (($ai-1) < ($bi+1))
|
||||
{
|
||||
$v=1;
|
||||
$v=2 if ($i+1) == $tot;
|
||||
}
|
||||
else
|
||||
{ $v=0; }
|
||||
if (!$v)
|
||||
{
|
||||
$na=$ai-1;
|
||||
$nb=$bi+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
$na=$as+($i < ($num-1));
|
||||
$nb=$bs+($i >= ($num-1));
|
||||
}
|
||||
if ($ai == $bi)
|
||||
{
|
||||
&sqr_add_c($r,$a,$ai,$bi,
|
||||
$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
}
|
||||
else
|
||||
{
|
||||
&sqr_add_c2($r,$a,$ai,$bi,
|
||||
$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
}
|
||||
if ($v)
|
||||
{
|
||||
&comment("saved r[$i]");
|
||||
#&mov(&DWP($i*4,$r,"",0),$c0);
|
||||
($c0,$c1,$c2)=($c1,$c2,$c0);
|
||||
last;
|
||||
}
|
||||
$ai--;
|
||||
$bi++;
|
||||
}
|
||||
$as++ if ($i < ($num-1));
|
||||
$ae++ if ($i >= ($num-1));
|
||||
|
||||
$bs++ if ($i >= ($num-1));
|
||||
$be++ if ($i < ($num-1));
|
||||
}
|
||||
&mov(&DWP($i*4,$r,"",0),$c0);
|
||||
&pop("ebx");
|
||||
&pop("ebp");
|
||||
&pop("edi");
|
||||
&pop("esi");
|
||||
&ret();
|
||||
&function_end_B($name);
|
||||
}
|
||||
851
openssl-1.0.2f/crypto/bn/asm/ia64-mont.pl
Normal file
851
openssl-1.0.2f/crypto/bn/asm/ia64-mont.pl
Normal file
@@ -0,0 +1,851 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# January 2010
|
||||
#
|
||||
# "Teaser" Montgomery multiplication module for IA-64. There are
|
||||
# several possibilities for improvement:
|
||||
#
|
||||
# - modulo-scheduling outer loop would eliminate quite a number of
|
||||
# stalls after ldf8, xma and getf.sig outside inner loop and
|
||||
# improve shorter key performance;
|
||||
# - shorter vector support [with input vectors being fetched only
|
||||
# once] should be added;
|
||||
# - 2x unroll with help of n0[1] would make the code scalable on
|
||||
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
|
||||
# acute interest, because upcoming Tukwila's individual cores are
|
||||
# reportedly based on Itanium 2 design;
|
||||
# - dedicated squaring procedure(?);
|
||||
#
|
||||
# January 2010
|
||||
#
|
||||
# Shorter vector support is implemented by zero-padding ap and np
|
||||
# vectors up to 8 elements, or 512 bits. This means that 256-bit
|
||||
# inputs will be processed only 2 times faster than 512-bit inputs,
|
||||
# not 4 [as one would expect, because algorithm complexity is n^2].
|
||||
# The reason for padding is that inputs shorter than 512 bits won't
|
||||
# be processed faster anyway, because minimal critical path of the
|
||||
# core loop happens to match 512-bit timing. Either way, it resulted
|
||||
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
|
||||
# 1024-bit one [in comparison to original version of *this* module].
|
||||
#
|
||||
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
|
||||
# this module is:
|
||||
# sign verify sign/s verify/s
|
||||
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
|
||||
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
|
||||
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
|
||||
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
|
||||
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
|
||||
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
|
||||
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
|
||||
#
|
||||
# ... and *without* (but still with ia64.S):
|
||||
#
|
||||
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
|
||||
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
|
||||
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
|
||||
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
|
||||
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
|
||||
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
|
||||
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
|
||||
#
|
||||
# As it can be seen, RSA sign performance improves by 130-30%,
|
||||
# hereafter less for longer keys, while verify - by 74-13%.
|
||||
# DSA performance improves by 115-30%.
|
||||
|
||||
if ($^O eq "hpux") {
|
||||
$ADDP="addp4";
|
||||
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
|
||||
} else { $ADDP="add"; }
|
||||
|
||||
$code=<<___;
|
||||
.explicit
|
||||
.text
|
||||
|
||||
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
|
||||
// const BN_ULONG *bp,const BN_ULONG *np,
|
||||
// const BN_ULONG *n0p,int num);
|
||||
.align 64
|
||||
.global bn_mul_mont#
|
||||
.proc bn_mul_mont#
|
||||
bn_mul_mont:
|
||||
.prologue
|
||||
.body
|
||||
{ .mmi; cmp4.le p6,p7=2,r37;;
|
||||
(p6) cmp4.lt.unc p8,p9=8,r37
|
||||
mov ret0=r0 };;
|
||||
{ .bbb;
|
||||
(p9) br.cond.dptk.many bn_mul_mont_8
|
||||
(p8) br.cond.dpnt.many bn_mul_mont_general
|
||||
(p7) br.ret.spnt.many b0 };;
|
||||
.endp bn_mul_mont#
|
||||
|
||||
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
|
||||
|
||||
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
|
||||
tptr=r16; // &tp[0]
|
||||
tp_1=r17; // &tp[-1]
|
||||
num=r18; len=r19; lc=r20;
|
||||
topbit=r21; // carry bit from tmp[num]
|
||||
|
||||
n0=f6;
|
||||
m0=f7;
|
||||
bi=f8;
|
||||
|
||||
.align 64
|
||||
.local bn_mul_mont_general#
|
||||
.proc bn_mul_mont_general#
|
||||
bn_mul_mont_general:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,6,2,0,8
|
||||
$ADDP aptr=0,in1
|
||||
.save ar.lc,prevlc
|
||||
mov prevlc=ar.lc }
|
||||
{ .mmi; .vframe prevsp
|
||||
mov prevsp=sp
|
||||
$ADDP bptr=0,in2
|
||||
.save pr,prevpr
|
||||
mov prevpr=pr };;
|
||||
|
||||
.body
|
||||
.rotf alo[6],nlo[4],ahi[8],nhi[6]
|
||||
.rotr a[3],n[3],t[2]
|
||||
|
||||
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
|
||||
ldf8 alo[4]=[aptr],16 // ap[0]
|
||||
$ADDP r30=8,in1 };;
|
||||
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
|
||||
ldf8 alo[2]=[aptr],16 // ap[2]
|
||||
$ADDP in4=0,in4 };;
|
||||
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
|
||||
ldf8 n0=[in4] // n0
|
||||
$ADDP rptr=0,in0 }
|
||||
{ .mmi; $ADDP nptr=0,in3
|
||||
mov r31=16
|
||||
zxt4 num=in5 };;
|
||||
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
|
||||
shladd len=num,3,r0
|
||||
shladd r31=num,3,r31 };;
|
||||
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
|
||||
add lc=-5,num
|
||||
sub r31=sp,r31 };;
|
||||
{ .mfb; and sp=-16,r31 // alloca
|
||||
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
|
||||
nop.b 0 }
|
||||
{ .mfb; nop.m 0
|
||||
xmpy.lu alo[4]=alo[4],bi
|
||||
brp.loop.imp .L1st_ctop,.L1st_cend-16
|
||||
};;
|
||||
{ .mfi; nop.m 0
|
||||
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
|
||||
add tp_1=8,sp }
|
||||
{ .mfi; nop.m 0
|
||||
xma.lu alo[3]=alo[3],bi,ahi[2]
|
||||
mov pr.rot=0x20001f<<16
|
||||
// ------^----- (p40) at first (p23)
|
||||
// ----------^^ p[16:20]=1
|
||||
};;
|
||||
{ .mfi; nop.m 0
|
||||
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
|
||||
mov ar.lc=lc }
|
||||
{ .mfi; nop.m 0
|
||||
fcvt.fxu.s1 nhi[1]=f0
|
||||
mov ar.ec=8 };;
|
||||
|
||||
.align 32
|
||||
.L1st_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
|
||||
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
|
||||
(p40) add n[2]=n[2],a[2] } // (p23) }
|
||||
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
|
||||
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
|
||||
(p42) add n[2]=n[2],a[2],1 };; // (p23)
|
||||
{ .mfi; (p21) getf.sig a[0]=alo[5]
|
||||
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
|
||||
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p23) st8 [tp_1]=n[2],8
|
||||
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
|
||||
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mmb; (p21) getf.sig n[0]=nlo[3]
|
||||
(p16) nop.m 0
|
||||
br.ctop.sptk .L1st_ctop };;
|
||||
.L1st_cend:
|
||||
|
||||
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
|
||||
getf.sig n[0]=nhi[4]
|
||||
add num=-1,num };; // num--
|
||||
{ .mmi; .pred.rel "mutex",p40,p42
|
||||
(p40) add n[0]=n[0],a[0]
|
||||
(p42) add n[0]=n[0],a[0],1
|
||||
sub aptr=aptr,len };; // rewind
|
||||
{ .mmi; .pred.rel "mutex",p40,p42
|
||||
(p40) cmp.ltu p41,p39=n[0],a[0]
|
||||
(p42) cmp.leu p41,p39=n[0],a[0]
|
||||
sub nptr=nptr,len };;
|
||||
{ .mmi; .pred.rel "mutex",p39,p41
|
||||
(p39) add topbit=r0,r0
|
||||
(p41) add topbit=r0,r0,1
|
||||
nop.i 0 }
|
||||
{ .mmi; st8 [tp_1]=n[0]
|
||||
add tptr=16,sp
|
||||
add tp_1=8,sp };;
|
||||
|
||||
.Louter:
|
||||
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
|
||||
ldf8 ahi[3]=[tptr] // tp[0]
|
||||
add r30=8,aptr };;
|
||||
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
|
||||
ldf8 alo[3]=[r30],16 // ap[1]
|
||||
add r31=8,nptr };;
|
||||
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
|
||||
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
|
||||
brp.loop.imp .Linner_ctop,.Linner_cend-16
|
||||
}
|
||||
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
|
||||
xma.lu alo[4]=alo[4],bi,ahi[3]
|
||||
clrrrb.pr };;
|
||||
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
|
||||
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
|
||||
nop.i 0 }
|
||||
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
|
||||
xma.lu alo[3]=alo[3],bi,ahi[2]
|
||||
mov pr.rot=0x20101f<<16
|
||||
// ------^----- (p40) at first (p23)
|
||||
// --------^--- (p30) at first (p22)
|
||||
// ----------^^ p[16:20]=1
|
||||
};;
|
||||
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
|
||||
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
|
||||
mov ar.lc=lc }
|
||||
{ .mfi;
|
||||
fcvt.fxu.s1 nhi[1]=f0
|
||||
mov ar.ec=8 };;
|
||||
|
||||
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
|
||||
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
|
||||
// in latter case accounts for two-tick pipeline stall, which means
|
||||
// that its performance would be ~20% lower than optimal one. No
|
||||
// attempt was made to address this, because original Itanium is
|
||||
// hardly represented out in the wild...
|
||||
.align 32
|
||||
.Linner_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p30,p32
|
||||
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
|
||||
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
|
||||
(p40) add n[2]=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
|
||||
(p42) add n[2]=n[2],a[2],1 };; // (p23)
|
||||
{ .mfi; (p21) getf.sig a[0]=alo[5]
|
||||
(p16) nop.f 0
|
||||
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
|
||||
{ .mfi; (p21) ld8 t[0]=[tptr],8
|
||||
(p16) nop.f 0
|
||||
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
|
||||
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
|
||||
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
|
||||
(p30) add a[1]=a[1],t[1] } // (p22)
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
|
||||
(p32) add a[1]=a[1],t[1],1 };; // (p22)
|
||||
{ .mmi; (p21) getf.sig n[0]=nlo[3]
|
||||
(p16) nop.m 0
|
||||
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
|
||||
{ .mmb; (p23) st8 [tp_1]=n[2],8
|
||||
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
|
||||
br.ctop.sptk .Linner_ctop };;
|
||||
.Linner_cend:
|
||||
|
||||
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
|
||||
getf.sig n[0]=nhi[4]
|
||||
nop.i 0 };;
|
||||
|
||||
{ .mmi; .pred.rel "mutex",p31,p33
|
||||
(p31) add a[0]=a[0],topbit
|
||||
(p33) add a[0]=a[0],topbit,1
|
||||
mov topbit=r0 };;
|
||||
{ .mfi; .pred.rel "mutex",p31,p33
|
||||
(p31) cmp.ltu p32,p30=a[0],topbit
|
||||
(p33) cmp.leu p32,p30=a[0],topbit
|
||||
}
|
||||
{ .mfi; .pred.rel "mutex",p40,p42
|
||||
(p40) add n[0]=n[0],a[0]
|
||||
(p42) add n[0]=n[0],a[0],1
|
||||
};;
|
||||
{ .mmi; .pred.rel "mutex",p44,p46
|
||||
(p40) cmp.ltu p41,p39=n[0],a[0]
|
||||
(p42) cmp.leu p41,p39=n[0],a[0]
|
||||
(p32) add topbit=r0,r0,1 }
|
||||
|
||||
{ .mmi; st8 [tp_1]=n[0],8
|
||||
cmp4.ne p6,p0=1,num
|
||||
sub aptr=aptr,len };; // rewind
|
||||
{ .mmi; sub nptr=nptr,len
|
||||
(p41) add topbit=r0,r0,1
|
||||
add tptr=16,sp }
|
||||
{ .mmb; add tp_1=8,sp
|
||||
add num=-1,num // num--
|
||||
(p6) br.cond.sptk.many .Louter };;
|
||||
|
||||
{ .mbb; add lc=4,lc
|
||||
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
|
||||
clrrrb.pr };;
|
||||
{ .mii; nop.m 0
|
||||
mov pr.rot=0x10001<<16
|
||||
// ------^---- (p33) at first (p17)
|
||||
mov ar.lc=lc }
|
||||
{ .mii; nop.m 0
|
||||
mov ar.ec=3
|
||||
nop.i 0 };;
|
||||
|
||||
.Lsub_ctop:
|
||||
.pred.rel "mutex",p33,p35
|
||||
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
|
||||
(p16) nop.f 0
|
||||
(p33) sub n[1]=t[1],n[1] } // (p17)
|
||||
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
|
||||
(p16) nop.f 0
|
||||
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
|
||||
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
|
||||
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
|
||||
(p18) nop.b 0 }
|
||||
{ .mib; (p18) nop.m 0
|
||||
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
|
||||
br.ctop.sptk .Lsub_ctop };;
|
||||
.Lsub_cend:
|
||||
|
||||
{ .mmb; .pred.rel "mutex",p34,p36
|
||||
(p34) sub topbit=topbit,r0 // (p19)
|
||||
(p36) sub topbit=topbit,r0,1
|
||||
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
|
||||
}
|
||||
{ .mmb; sub rptr=rptr,len // rewind
|
||||
sub tptr=tptr,len
|
||||
clrrrb.pr };;
|
||||
{ .mmi; and aptr=tptr,topbit
|
||||
andcm bptr=rptr,topbit
|
||||
mov pr.rot=1<<16 };;
|
||||
{ .mii; or nptr=aptr,bptr
|
||||
mov ar.lc=lc
|
||||
mov ar.ec=3 };;
|
||||
|
||||
.Lcopy_ctop:
|
||||
{ .mmb; (p16) ld8 n[0]=[nptr],8
|
||||
(p18) st8 [tptr]=r0,8
|
||||
(p16) nop.b 0 }
|
||||
{ .mmb; (p16) nop.m 0
|
||||
(p18) st8 [rptr]=n[2],8
|
||||
br.ctop.sptk .Lcopy_ctop };;
|
||||
.Lcopy_cend:
|
||||
|
||||
{ .mmi; mov ret0=1 // signal "handled"
|
||||
rum 1<<5 // clear um.mfh
|
||||
mov ar.lc=prevlc }
|
||||
{ .mib; .restore sp
|
||||
mov sp=prevsp
|
||||
mov pr=prevpr,0x1ffff
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp bn_mul_mont_general#
|
||||
|
||||
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
|
||||
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
|
||||
t0=r15;
|
||||
|
||||
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
|
||||
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
|
||||
|
||||
.align 64
|
||||
.skip 48 // aligns loop body
|
||||
.local bn_mul_mont_8#
|
||||
.proc bn_mul_mont_8#
|
||||
bn_mul_mont_8:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,6,2,0,8
|
||||
.vframe prevsp
|
||||
mov prevsp=sp
|
||||
.save ar.lc,prevlc
|
||||
mov prevlc=ar.lc }
|
||||
{ .mmi; add r17=-6*16,sp
|
||||
add sp=-7*16,sp
|
||||
.save pr,prevpr
|
||||
mov prevpr=pr };;
|
||||
|
||||
{ .mmi; .save.gf 0,0x10
|
||||
stf.spill [sp]=f16,-16
|
||||
.save.gf 0,0x20
|
||||
stf.spill [r17]=f17,32
|
||||
add r16=-5*16,prevsp};;
|
||||
{ .mmi; .save.gf 0,0x40
|
||||
stf.spill [r16]=f18,32
|
||||
.save.gf 0,0x80
|
||||
stf.spill [r17]=f19,32
|
||||
$ADDP aptr=0,in1 };;
|
||||
{ .mmi; .save.gf 0,0x100
|
||||
stf.spill [r16]=f20,32
|
||||
.save.gf 0,0x200
|
||||
stf.spill [r17]=f21,32
|
||||
$ADDP r29=8,in1 };;
|
||||
{ .mmi; .save.gf 0,0x400
|
||||
stf.spill [r16]=f22
|
||||
.save.gf 0,0x800
|
||||
stf.spill [r17]=f23
|
||||
$ADDP rptr=0,in0 };;
|
||||
|
||||
.body
|
||||
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
|
||||
.rotr t[8]
|
||||
|
||||
// load input vectors padding them to 8 elements
|
||||
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
|
||||
ldf8 ai1=[r29],16 // ap[1]
|
||||
$ADDP bptr=0,in2 }
|
||||
{ .mmi; $ADDP r30=8,in2
|
||||
$ADDP nptr=0,in3
|
||||
$ADDP r31=8,in3 };;
|
||||
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
|
||||
ldf8 bj[6]=[r30],16 // bp[1]
|
||||
cmp4.le p4,p5=3,in5 }
|
||||
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
|
||||
ldf8 ni1=[r31],16 // np[1]
|
||||
cmp4.le p6,p7=4,in5 };;
|
||||
|
||||
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
|
||||
(p5)fcvt.fxu ai2=f0
|
||||
cmp4.le p8,p9=5,in5 }
|
||||
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
|
||||
(p7)fcvt.fxu ai3=f0
|
||||
cmp4.le p10,p11=6,in5 }
|
||||
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
|
||||
(p5)fcvt.fxu bj[5]=f0
|
||||
cmp4.le p12,p13=7,in5 }
|
||||
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
|
||||
(p7)fcvt.fxu bj[4]=f0
|
||||
cmp4.le p14,p15=8,in5 }
|
||||
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
|
||||
(p5)fcvt.fxu ni2=f0
|
||||
addp4 r28=-1,in5 }
|
||||
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
|
||||
(p7)fcvt.fxu ni3=f0
|
||||
$ADDP in4=0,in4 };;
|
||||
|
||||
{ .mfi; ldf8 n0=[in4]
|
||||
fcvt.fxu tf[1]=f0
|
||||
nop.i 0 }
|
||||
|
||||
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
|
||||
(p9)fcvt.fxu ai4=f0
|
||||
mov t[0]=r0 }
|
||||
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
|
||||
(p11)fcvt.fxu ai5=f0
|
||||
mov t[1]=r0 }
|
||||
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
|
||||
(p9)fcvt.fxu bj[3]=f0
|
||||
mov t[2]=r0 }
|
||||
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
|
||||
(p11)fcvt.fxu bj[2]=f0
|
||||
mov t[3]=r0 }
|
||||
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
|
||||
(p9)fcvt.fxu ni4=f0
|
||||
mov t[4]=r0 }
|
||||
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
|
||||
(p11)fcvt.fxu ni5=f0
|
||||
mov t[5]=r0 };;
|
||||
|
||||
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
|
||||
(p13)fcvt.fxu ai6=f0
|
||||
mov t[6]=r0 }
|
||||
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
|
||||
(p15)fcvt.fxu ai7=f0
|
||||
mov t[7]=r0 }
|
||||
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
|
||||
(p13)fcvt.fxu bj[1]=f0
|
||||
mov ar.lc=r28 }
|
||||
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
|
||||
(p15)fcvt.fxu bj[0]=f0
|
||||
mov ar.ec=1 }
|
||||
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
|
||||
(p13)fcvt.fxu ni6=f0
|
||||
mov pr.rot=1<<16 }
|
||||
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
|
||||
(p15)fcvt.fxu ni7=f0
|
||||
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
|
||||
};;
|
||||
|
||||
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
|
||||
// to measure with help of Interval Time Counter indicated that the
|
||||
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
|
||||
// addressing the issue is problematic, because I don't have access
|
||||
// to platform-specific instruction-level profiler. On Itanium it
|
||||
// should run in 56*n ticks, because of higher xma latency...
|
||||
.Louter_8_ctop:
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 0:
|
||||
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
|
||||
(p40) add a3=a3,n3 } // (p17) a3+=n3
|
||||
{ .mfi; (p42) add a3=a3,n3,1
|
||||
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig a7=alo[8] // 1:
|
||||
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
|
||||
(p50) add t[6]=t[6],a3,1 };;
|
||||
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
|
||||
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
|
||||
(p40) cmp.ltu p43,p41=a3,n3 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
|
||||
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
|
||||
(p48) cmp.ltu p51,p49=t[6],a3
|
||||
(p50) cmp.leu p51,p49=t[6],a3 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p16) nop.m 0 // 4:
|
||||
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
|
||||
(p41) add a4=a4,n4 } // (p17) a4+=n4
|
||||
{ .mfi; (p43) add a4=a4,n4,1
|
||||
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
|
||||
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
|
||||
(p51) add t[5]=t[5],a4,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 6:
|
||||
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
|
||||
(p41) cmp.ltu p42,p40=a4,n4 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
|
||||
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
|
||||
(p49) cmp.ltu p50,p48=t[5],a4
|
||||
(p51) cmp.leu p50,p48=t[5],a4 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 8:
|
||||
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
|
||||
(p40) add a5=a5,n5 } // (p17) a5+=n5
|
||||
{ .mfi; (p42) add a5=a5,n5,1
|
||||
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a1=alo[1] // 9:
|
||||
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
|
||||
(p50) add t[4]=t[4],a5,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 10:
|
||||
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
|
||||
(p40) cmp.ltu p43,p41=a5,n5 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
|
||||
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
|
||||
(p48) cmp.ltu p51,p49=t[4],a5
|
||||
(p50) cmp.leu p51,p49=t[4],a5 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
|
||||
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
|
||||
(p41) add a6=a6,n6 } // (p17) a6+=n6
|
||||
{ .mfi; (p43) add a6=a6,n6,1
|
||||
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a2=alo[2] // 13:
|
||||
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
|
||||
(p51) add t[3]=t[3],a6,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 14:
|
||||
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
|
||||
(p41) cmp.ltu p42,p40=a6,n6 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
|
||||
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) nop.m 0 // 15:
|
||||
(p49) cmp.ltu p50,p48=t[3],a6
|
||||
(p51) cmp.leu p50,p48=t[3],a6 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 16:
|
||||
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
|
||||
(p40) add a7=a7,n7 } // (p17) a7+=n7
|
||||
{ .mfi; (p42) add a7=a7,n7,1
|
||||
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a3=alo[3] // 17:
|
||||
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
|
||||
(p50) add t[2]=t[2],a7,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 18:
|
||||
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
|
||||
(p40) cmp.ltu p43,p41=a7,n7 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
|
||||
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
|
||||
(p48) cmp.ltu p51,p49=t[2],a7
|
||||
(p50) cmp.leu p51,p49=t[2],a7 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mfi; (p16) nop.m 0 // 20:
|
||||
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
|
||||
(p41) add a8=a8,n8 } // (p17) a8+=n8
|
||||
{ .mfi; (p43) add a8=a8,n8,1
|
||||
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a4=alo[4] // 21:
|
||||
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
|
||||
(p51) add t[1]=t[1],a8,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 22:
|
||||
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
|
||||
(p41) cmp.ltu p42,p40=a8,n8 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
|
||||
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
|
||||
(p49) cmp.ltu p50,p48=t[1],a8
|
||||
(p51) cmp.leu p50,p48=t[1],a8 };;
|
||||
{ .mfi; (p16) nop.m 0 // 24:
|
||||
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
|
||||
(p16) add a1=a1,n1 } // (p16) a1+=n1
|
||||
{ .mfi; (p16) nop.m 0
|
||||
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
|
||||
(p17) mov t[0]=r0 };;
|
||||
{ .mii; (p16) getf.sig a5=alo[5] // 25:
|
||||
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
|
||||
(p42) add t[0]=t[0],r0,1 };;
|
||||
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
|
||||
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
|
||||
(p50) add t[0]=t[0],r0,1 }
|
||||
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
|
||||
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
|
||||
(p16) cmp.ltu.unc p50,p48=t0,a1
|
||||
(p16) nop.i 0 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mfi; (p16) nop.m 0 // 28:
|
||||
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
|
||||
(p40) add a2=a2,n2 } // (p16) a2+=n2
|
||||
{ .mfi; (p42) add a2=a2,n2,1
|
||||
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mii; (p16) getf.sig a6=alo[6] // 29:
|
||||
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
|
||||
(p50) add t[6]=t[6],a2,1 };;
|
||||
{ .mfi; (p16) nop.m 0 // 30:
|
||||
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
|
||||
(p40) cmp.ltu p41,p39=a2,n2 }
|
||||
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
|
||||
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
|
||||
(p16) nop.i 0 };;
|
||||
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
|
||||
(p16) nop.f 0
|
||||
(p48) cmp.ltu p49,p47=t[6],a2 }
|
||||
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
|
||||
(p16) nop.f 0
|
||||
br.ctop.sptk.many .Louter_8_ctop };;
|
||||
.Louter_8_cend:
|
||||
|
||||
// above loop has to execute one more time, without (p16), which is
|
||||
// replaced with merged move of np[8] to GPR bank
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mmi; (p0) getf.sig n1=ni0 // 0:
|
||||
(p40) add a3=a3,n3 // (p17) a3+=n3
|
||||
(p42) add a3=a3,n3,1 };;
|
||||
{ .mii; (p17) getf.sig a7=alo[8] // 1:
|
||||
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
|
||||
(p50) add t[6]=t[6],a3,1 };;
|
||||
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
|
||||
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
|
||||
(p40) cmp.ltu p43,p41=a3,n3 }
|
||||
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
|
||||
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
|
||||
(p0) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
|
||||
(p48) cmp.ltu p51,p49=t[6],a3
|
||||
(p50) cmp.leu p51,p49=t[6],a3 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mmi; (p0) getf.sig n2=ni1 // 4:
|
||||
(p41) add a4=a4,n4 // (p17) a4+=n4
|
||||
(p43) add a4=a4,n4,1 };;
|
||||
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
|
||||
(p0) nop.f 0
|
||||
(p51) add t[5]=t[5],a4,1 };;
|
||||
{ .mfi; (p0) getf.sig n3=ni2 // 6:
|
||||
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
|
||||
(p41) cmp.ltu p42,p40=a4,n4 }
|
||||
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
|
||||
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
|
||||
(p0) nop.i 0 };;
|
||||
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
|
||||
(p49) cmp.ltu p50,p48=t[5],a4
|
||||
(p51) cmp.leu p50,p48=t[5],a4 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mii; (p0) getf.sig n4=ni3 // 8:
|
||||
(p40) add a5=a5,n5 // (p17) a5+=n5
|
||||
(p42) add a5=a5,n5,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 9:
|
||||
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
|
||||
(p50) add t[4]=t[4],a5,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 10:
|
||||
(p40) cmp.ltu p43,p41=a5,n5
|
||||
(p42) cmp.leu p43,p41=a5,n5 };;
|
||||
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
|
||||
(p48) cmp.ltu p51,p49=t[4],a5
|
||||
(p50) cmp.leu p51,p49=t[4],a5 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
|
||||
(p41) add a6=a6,n6 // (p17) a6+=n6
|
||||
(p43) add a6=a6,n6,1 };;
|
||||
{ .mii; (p0) getf.sig n5=ni4 // 13:
|
||||
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
|
||||
(p51) add t[3]=t[3],a6,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 14:
|
||||
(p41) cmp.ltu p42,p40=a6,n6
|
||||
(p43) cmp.leu p42,p40=a6,n6 };;
|
||||
{ .mii; (p0) getf.sig n6=ni5 // 15:
|
||||
(p49) cmp.ltu p50,p48=t[3],a6
|
||||
(p51) cmp.leu p50,p48=t[3],a6 };;
|
||||
.pred.rel "mutex",p40,p42
|
||||
.pred.rel "mutex",p48,p50
|
||||
{ .mii; (p0) nop.m 0 // 16:
|
||||
(p40) add a7=a7,n7 // (p17) a7+=n7
|
||||
(p42) add a7=a7,n7,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 17:
|
||||
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
|
||||
(p50) add t[2]=t[2],a7,1 };;
|
||||
{ .mii; (p0) nop.m 0 // 18:
|
||||
(p40) cmp.ltu p43,p41=a7,n7
|
||||
(p42) cmp.leu p43,p41=a7,n7 };;
|
||||
{ .mii; (p0) getf.sig n7=ni6 // 19:
|
||||
(p48) cmp.ltu p51,p49=t[2],a7
|
||||
(p50) cmp.leu p51,p49=t[2],a7 };;
|
||||
.pred.rel "mutex",p41,p43
|
||||
.pred.rel "mutex",p49,p51
|
||||
{ .mii; (p0) nop.m 0 // 20:
|
||||
(p41) add a8=a8,n8 // (p17) a8+=n8
|
||||
(p43) add a8=a8,n8,1 };;
|
||||
{ .mmi; (p0) nop.m 0 // 21:
|
||||
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
|
||||
(p51) add t[1]=t[1],a8,1 }
|
||||
{ .mmi; (p17) mov t[0]=r0
|
||||
(p41) cmp.ltu p42,p40=a8,n8
|
||||
(p43) cmp.leu p42,p40=a8,n8 };;
|
||||
{ .mmi; (p0) getf.sig n8=ni7 // 22:
|
||||
(p49) cmp.ltu p50,p48=t[1],a8
|
||||
(p51) cmp.leu p50,p48=t[1],a8 }
|
||||
{ .mmi; (p42) add t[0]=t[0],r0,1
|
||||
(p0) add r16=-7*16,prevsp
|
||||
(p0) add r17=-6*16,prevsp };;
|
||||
|
||||
// subtract np[8] from carrybit|tmp[8]
|
||||
// carrybit|tmp[8] layout upon exit from above loop is:
|
||||
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
|
||||
{ .mmi; (p50)add t[0]=t[0],r0,1
|
||||
add r18=-5*16,prevsp
|
||||
sub n1=t0,n1 };;
|
||||
{ .mmi; cmp.gtu p34,p32=n1,t0;;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n2=t[7],n2
|
||||
(p34)sub n2=t[7],n2,1 };;
|
||||
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
|
||||
(p34)cmp.geu p35,p33=n2,t[7];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub n3=t[6],n3 }
|
||||
{ .mmi; (p35)sub n3=t[6],n3,1;;
|
||||
(p33)cmp.gtu p34,p32=n3,t[6]
|
||||
(p35)cmp.geu p34,p32=n3,t[6] };;
|
||||
.pred.rel "mutex",p32,p34
|
||||
{ .mii; (p32)sub n4=t[5],n4
|
||||
(p34)sub n4=t[5],n4,1;;
|
||||
(p32)cmp.gtu p35,p33=n4,t[5] }
|
||||
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub n5=t[4],n5
|
||||
(p35)sub n5=t[4],n5,1 };;
|
||||
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
|
||||
(p35)cmp.geu p34,p32=n5,t[4];;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n6=t[3],n6 }
|
||||
{ .mmi; (p34)sub n6=t[3],n6,1;;
|
||||
(p32)cmp.gtu p35,p33=n6,t[3]
|
||||
(p34)cmp.geu p35,p33=n6,t[3] };;
|
||||
.pred.rel "mutex",p33,p35
|
||||
{ .mii; (p33)sub n7=t[2],n7
|
||||
(p35)sub n7=t[2],n7,1;;
|
||||
(p33)cmp.gtu p34,p32=n7,t[2] }
|
||||
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
|
||||
.pred.rel "mutex",p32,p34
|
||||
(p32)sub n8=t[1],n8
|
||||
(p34)sub n8=t[1],n8,1 };;
|
||||
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
|
||||
(p34)cmp.geu p35,p33=n8,t[1];;
|
||||
.pred.rel "mutex",p33,p35
|
||||
(p33)sub a8=t[0],r0 }
|
||||
{ .mmi; (p35)sub a8=t[0],r0,1;;
|
||||
(p33)cmp.gtu p34,p32=a8,t[0]
|
||||
(p35)cmp.geu p34,p32=a8,t[0] };;
|
||||
|
||||
// save the result, either tmp[num] or tmp[num]-np[num]
|
||||
.pred.rel "mutex",p32,p34
|
||||
{ .mmi; (p32)st8 [rptr]=n1,8
|
||||
(p34)st8 [rptr]=t0,8
|
||||
add r19=-4*16,prevsp};;
|
||||
{ .mmb; (p32)st8 [rptr]=n2,8
|
||||
(p34)st8 [rptr]=t[7],8
|
||||
(p5)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n3,8
|
||||
(p34)st8 [rptr]=t[6],8
|
||||
(p7)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n4,8
|
||||
(p34)st8 [rptr]=t[5],8
|
||||
(p9)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n5,8
|
||||
(p34)st8 [rptr]=t[4],8
|
||||
(p11)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n6,8
|
||||
(p34)st8 [rptr]=t[3],8
|
||||
(p13)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n7,8
|
||||
(p34)st8 [rptr]=t[2],8
|
||||
(p15)br.cond.dpnt.few .Ldone };;
|
||||
{ .mmb; (p32)st8 [rptr]=n8,8
|
||||
(p34)st8 [rptr]=t[1],8
|
||||
nop.b 0 };;
|
||||
.Ldone: // epilogue
|
||||
{ .mmi; ldf.fill f16=[r16],64
|
||||
ldf.fill f17=[r17],64
|
||||
nop.i 0 }
|
||||
{ .mmi; ldf.fill f18=[r18],64
|
||||
ldf.fill f19=[r19],64
|
||||
mov pr=prevpr,0x1ffff };;
|
||||
{ .mmi; ldf.fill f20=[r16]
|
||||
ldf.fill f21=[r17]
|
||||
mov ar.lc=prevlc }
|
||||
{ .mmi; ldf.fill f22=[r18]
|
||||
ldf.fill f23=[r19]
|
||||
mov ret0=1 } // signal "handled"
|
||||
{ .mib; rum 1<<5
|
||||
.restore sp
|
||||
mov sp=prevsp
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp bn_mul_mont_8#
|
||||
|
||||
.type copyright#,\@object
|
||||
copyright:
|
||||
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$output=shift and open STDOUT,">$output";
|
||||
print $code;
|
||||
close STDOUT;
|
||||
1555
openssl-1.0.2f/crypto/bn/asm/ia64.S
Normal file
1555
openssl-1.0.2f/crypto/bn/asm/ia64.S
Normal file
File diff suppressed because it is too large
Load Diff
426
openssl-1.0.2f/crypto/bn/asm/mips-mont.pl
Normal file
426
openssl-1.0.2f/crypto/bn/asm/mips-mont.pl
Normal file
@@ -0,0 +1,426 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# This module doesn't present direct interest for OpenSSL, because it
|
||||
# doesn't provide better performance for longer keys, at least not on
|
||||
# in-order-execution cores. While 512-bit RSA sign operations can be
|
||||
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
|
||||
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
|
||||
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
|
||||
# verify:-( All comparisons are against bn_mul_mont-free assembler.
|
||||
# The module might be of interest to embedded system developers, as
|
||||
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
|
||||
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
|
||||
# code.
|
||||
|
||||
######################################################################
|
||||
# There is a number of MIPS ABI in use, O32 and N32/64 are most
|
||||
# widely used. Then there is a new contender: NUBI. It appears that if
|
||||
# one picks the latter, it's possible to arrange code in ABI neutral
|
||||
# manner. Therefore let's stick to NUBI register layout:
|
||||
#
|
||||
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
|
||||
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
|
||||
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
|
||||
#
|
||||
# The return value is placed in $a0. Following coding rules facilitate
|
||||
# interoperability:
|
||||
#
|
||||
# - never ever touch $tp, "thread pointer", former $gp;
|
||||
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
|
||||
# old code];
|
||||
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
|
||||
#
|
||||
# For reference here is register layout for N32/64 MIPS ABIs:
|
||||
#
|
||||
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
||||
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
||||
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
||||
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
||||
#
|
||||
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
|
||||
|
||||
if ($flavour =~ /64|n32/i) {
|
||||
$PTR_ADD="dadd"; # incidentally works even on n32
|
||||
$PTR_SUB="dsub"; # incidentally works even on n32
|
||||
$REG_S="sd";
|
||||
$REG_L="ld";
|
||||
$SZREG=8;
|
||||
} else {
|
||||
$PTR_ADD="add";
|
||||
$PTR_SUB="sub";
|
||||
$REG_S="sw";
|
||||
$REG_L="lw";
|
||||
$SZREG=4;
|
||||
}
|
||||
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
#
|
||||
######################################################################
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
if ($flavour =~ /64|n32/i) {
|
||||
$LD="ld";
|
||||
$ST="sd";
|
||||
$MULTU="dmultu";
|
||||
$ADDU="daddu";
|
||||
$SUBU="dsubu";
|
||||
$BNSZ=8;
|
||||
} else {
|
||||
$LD="lw";
|
||||
$ST="sw";
|
||||
$MULTU="multu";
|
||||
$ADDU="addu";
|
||||
$SUBU="subu";
|
||||
$BNSZ=4;
|
||||
}
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp=$a0; # BN_ULONG *rp,
|
||||
$ap=$a1; # const BN_ULONG *ap,
|
||||
$bp=$a2; # const BN_ULONG *bp,
|
||||
$np=$a3; # const BN_ULONG *np,
|
||||
$n0=$a4; # const BN_ULONG *n0,
|
||||
$num=$a5; # int num);
|
||||
|
||||
$lo0=$a6;
|
||||
$hi0=$a7;
|
||||
$lo1=$t1;
|
||||
$hi1=$t2;
|
||||
$aj=$s0;
|
||||
$bi=$s1;
|
||||
$nj=$s2;
|
||||
$tp=$s3;
|
||||
$alo=$s4;
|
||||
$ahi=$s5;
|
||||
$nlo=$s6;
|
||||
$nhi=$s7;
|
||||
$tj=$s8;
|
||||
$i=$s9;
|
||||
$j=$s10;
|
||||
$m1=$s11;
|
||||
|
||||
$FRAMESIZE=14;
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
.align 5
|
||||
.globl bn_mul_mont
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /o32/i);
|
||||
lw $n0,16($sp)
|
||||
lw $num,20($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
slt $at,$num,4
|
||||
bnez $at,1f
|
||||
li $t0,0
|
||||
slt $at,$num,17 # on in-order CPU
|
||||
bnez $at,bn_mul_mont_internal
|
||||
nop
|
||||
1: jr $ra
|
||||
li $a0,0
|
||||
.end bn_mul_mont
|
||||
|
||||
.align 5
|
||||
.ent bn_mul_mont_internal
|
||||
bn_mul_mont_internal:
|
||||
.frame $fp,$FRAMESIZE*$SZREG,$ra
|
||||
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
|
||||
$PTR_SUB $sp,$FRAMESIZE*$SZREG
|
||||
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
|
||||
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
|
||||
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
|
||||
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
|
||||
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
|
||||
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
|
||||
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
|
||||
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
|
||||
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i);
|
||||
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
|
||||
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
|
||||
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
|
||||
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
move $fp,$sp
|
||||
|
||||
.set reorder
|
||||
$LD $n0,0($n0)
|
||||
$LD $bi,0($bp) # bp[0]
|
||||
$LD $aj,0($ap) # ap[0]
|
||||
$LD $nj,0($np) # np[0]
|
||||
|
||||
$PTR_SUB $sp,2*$BNSZ # place for two extra words
|
||||
sll $num,`log($BNSZ)/log(2)`
|
||||
li $at,-4096
|
||||
$PTR_SUB $sp,$num
|
||||
and $sp,$at
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$LD $alo,$BNSZ($ap)
|
||||
$LD $nlo,$BNSZ($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
$MULTU $lo0,$n0
|
||||
mflo $m1
|
||||
|
||||
$MULTU $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$MULTU $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
$MULTU $nlo,$m1
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,$sp
|
||||
li $j,2*$BNSZ
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
$PTR_ADD $aj,$ap,$j
|
||||
$PTR_ADD $nj,$np,$j
|
||||
$LD $aj,($aj)
|
||||
$LD $nj,($nj)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo0,$hi0
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$MULTU $nj,$m1
|
||||
$ADDU $hi1,$at
|
||||
addu $j,$BNSZ
|
||||
$ST $lo1,($tp)
|
||||
sltu $t0,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
bnez $t0,.L1st
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
.set reorder
|
||||
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
sltu $at,$lo0,$hi0
|
||||
$ADDU $hi0,$ahi,$at
|
||||
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
|
||||
$ST $lo1,($tp)
|
||||
|
||||
$ADDU $hi1,$hi0
|
||||
sltu $at,$hi1,$hi0
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
$ST $at,2*$BNSZ($tp)
|
||||
|
||||
li $i,$BNSZ
|
||||
.align 4
|
||||
.Louter:
|
||||
$PTR_ADD $bi,$bp,$i
|
||||
$LD $bi,($bi)
|
||||
$LD $aj,($ap)
|
||||
$LD $alo,$BNSZ($ap)
|
||||
$LD $tj,($sp)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$LD $nj,($np)
|
||||
$LD $nlo,$BNSZ($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
$ADDU $lo0,$tj
|
||||
$MULTU $lo0,$n0
|
||||
sltu $at,$lo0,$tj
|
||||
$ADDU $hi0,$at
|
||||
mflo $m1
|
||||
|
||||
$MULTU $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$MULTU $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
|
||||
$MULTU $nlo,$m1
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $at,$lo1,$lo0
|
||||
$ADDU $hi1,$at
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,$sp
|
||||
li $j,2*$BNSZ
|
||||
$LD $tj,$BNSZ($tp)
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
$PTR_ADD $aj,$ap,$j
|
||||
$PTR_ADD $nj,$np,$j
|
||||
$LD $aj,($aj)
|
||||
$LD $nj,($nj)
|
||||
|
||||
$MULTU $aj,$bi
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo0,$hi0
|
||||
sltu $t0,$lo1,$hi1
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $hi1,$nhi,$t0
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
$ADDU $lo0,$tj
|
||||
addu $j,$BNSZ
|
||||
$MULTU $nj,$m1
|
||||
sltu $at,$lo0,$tj
|
||||
$ADDU $lo1,$lo0
|
||||
$ADDU $hi0,$at
|
||||
sltu $t0,$lo1,$lo0
|
||||
$LD $tj,2*$BNSZ($tp)
|
||||
$ADDU $hi1,$t0
|
||||
sltu $at,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
$ST $lo1,($tp)
|
||||
bnez $at,.Linner
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
.set reorder
|
||||
|
||||
$ADDU $lo0,$alo,$hi0
|
||||
sltu $at,$lo0,$hi0
|
||||
$ADDU $hi0,$ahi,$at
|
||||
$ADDU $lo0,$tj
|
||||
sltu $t0,$lo0,$tj
|
||||
$ADDU $hi0,$t0
|
||||
|
||||
$LD $tj,2*$BNSZ($tp)
|
||||
$ADDU $lo1,$nlo,$hi1
|
||||
sltu $at,$lo1,$hi1
|
||||
$ADDU $hi1,$nhi,$at
|
||||
$ADDU $lo1,$lo0
|
||||
sltu $t0,$lo1,$lo0
|
||||
$ADDU $hi1,$t0
|
||||
$ST $lo1,($tp)
|
||||
|
||||
$ADDU $lo1,$hi1,$hi0
|
||||
sltu $hi1,$lo1,$hi0
|
||||
$ADDU $lo1,$tj
|
||||
sltu $at,$lo1,$tj
|
||||
$ADDU $hi1,$at
|
||||
$ST $lo1,$BNSZ($tp)
|
||||
$ST $hi1,2*$BNSZ($tp)
|
||||
|
||||
addu $i,$BNSZ
|
||||
sltu $t0,$i,$num
|
||||
bnez $t0,.Louter
|
||||
|
||||
.set noreorder
|
||||
$PTR_ADD $tj,$sp,$num # &tp[num]
|
||||
move $tp,$sp
|
||||
move $ap,$sp
|
||||
li $hi0,0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: $LD $lo0,($tp)
|
||||
$LD $lo1,($np)
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
$PTR_ADD $np,$BNSZ
|
||||
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
|
||||
sgtu $at,$lo1,$lo0
|
||||
$SUBU $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
$ST $lo0,($rp)
|
||||
or $hi0,$at
|
||||
sltu $at,$tp,$tj
|
||||
bnez $at,.Lsub
|
||||
$PTR_ADD $rp,$BNSZ
|
||||
|
||||
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
|
||||
move $tp,$sp
|
||||
$PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,$sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: $LD $aj,($ap)
|
||||
$PTR_ADD $ap,$BNSZ
|
||||
$ST $zero,($tp)
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
sltu $at,$tp,$tj
|
||||
$ST $aj,($rp)
|
||||
bnez $at,.Lcopy
|
||||
$PTR_ADD $rp,$BNSZ
|
||||
|
||||
li $a0,1
|
||||
li $t0,1
|
||||
|
||||
.set noreorder
|
||||
move $sp,$fp
|
||||
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
|
||||
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
|
||||
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
|
||||
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
|
||||
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
|
||||
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
|
||||
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
|
||||
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
|
||||
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i);
|
||||
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
|
||||
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
|
||||
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
|
||||
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
jr $ra
|
||||
$PTR_ADD $sp,$FRAMESIZE*$SZREG
|
||||
.end bn_mul_mont_internal
|
||||
.rdata
|
||||
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
||||
2234
openssl-1.0.2f/crypto/bn/asm/mips.pl
Normal file
2234
openssl-1.0.2f/crypto/bn/asm/mips.pl
Normal file
File diff suppressed because it is too large
Load Diff
327
openssl-1.0.2f/crypto/bn/asm/mips3-mont.pl
Normal file
327
openssl-1.0.2f/crypto/bn/asm/mips3-mont.pl
Normal file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# This module doesn't present direct interest for OpenSSL, because it
|
||||
# doesn't provide better performance for longer keys. While 512-bit
|
||||
# RSA private key operations are 40% faster, 1024-bit ones are hardly
|
||||
# faster at all, while longer key operations are slower by up to 20%.
|
||||
# It might be of interest to embedded system developers though, as
|
||||
# it's smaller than 1KB, yet offers ~3x improvement over compiler
|
||||
# generated code.
|
||||
#
|
||||
# The module targets N32 and N64 MIPS ABIs and currently is a bit
|
||||
# IRIX-centric, i.e. is likely to require adaptation for other OSes.
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="a0"; # BN_ULONG *rp,
|
||||
$ap="a1"; # const BN_ULONG *ap,
|
||||
$bp="a2"; # const BN_ULONG *bp,
|
||||
$np="a3"; # const BN_ULONG *np,
|
||||
$n0="a4"; # const BN_ULONG *n0,
|
||||
$num="a5"; # int num);
|
||||
|
||||
$lo0="a6";
|
||||
$hi0="a7";
|
||||
$lo1="v0";
|
||||
$hi1="v1";
|
||||
$aj="t0";
|
||||
$bi="t1";
|
||||
$nj="t2";
|
||||
$tp="t3";
|
||||
$alo="s0";
|
||||
$ahi="s1";
|
||||
$nlo="s2";
|
||||
$nhi="s3";
|
||||
$tj="s4";
|
||||
$i="s5";
|
||||
$j="s6";
|
||||
$fp="t8";
|
||||
$m1="t9";
|
||||
|
||||
$FRAME=8*(2+8);
|
||||
|
||||
$code=<<___;
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set reorder
|
||||
|
||||
.align 5
|
||||
.globl bn_mul_mont
|
||||
.ent bn_mul_mont
|
||||
bn_mul_mont:
|
||||
.set noreorder
|
||||
PTR_SUB sp,64
|
||||
move $fp,sp
|
||||
.frame $fp,64,ra
|
||||
slt AT,$num,4
|
||||
li v0,0
|
||||
beqzl AT,.Lproceed
|
||||
nop
|
||||
jr ra
|
||||
PTR_ADD sp,$fp,64
|
||||
.set reorder
|
||||
.align 5
|
||||
.Lproceed:
|
||||
ld $n0,0($n0)
|
||||
ld $bi,0($bp) # bp[0]
|
||||
ld $aj,0($ap) # ap[0]
|
||||
ld $nj,0($np) # np[0]
|
||||
PTR_SUB sp,16 # place for two extra words
|
||||
sll $num,3
|
||||
li AT,-4096
|
||||
PTR_SUB sp,$num
|
||||
and sp,AT
|
||||
|
||||
sd s0,0($fp)
|
||||
sd s1,8($fp)
|
||||
sd s2,16($fp)
|
||||
sd s3,24($fp)
|
||||
sd s4,32($fp)
|
||||
sd s5,40($fp)
|
||||
sd s6,48($fp)
|
||||
sd s7,56($fp)
|
||||
|
||||
dmultu $aj,$bi
|
||||
ld $alo,8($ap)
|
||||
ld $nlo,8($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
dmultu $lo0,$n0
|
||||
mflo $m1
|
||||
|
||||
dmultu $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
dmultu $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
dmultu $nlo,$m1
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,sp
|
||||
li $j,16
|
||||
.align 4
|
||||
.L1st:
|
||||
.set noreorder
|
||||
PTR_ADD $aj,$ap,$j
|
||||
ld $aj,($aj)
|
||||
PTR_ADD $nj,$np,$j
|
||||
ld $nj,($nj)
|
||||
|
||||
dmultu $aj,$bi
|
||||
daddu $lo0,$alo,$hi0
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo0,$hi0
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $hi1,$nhi,s7
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
dmultu $nj,$m1
|
||||
daddu $hi1,AT
|
||||
addu $j,8
|
||||
sd $lo1,($tp)
|
||||
sltu s7,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
bnez s7,.L1st
|
||||
PTR_ADD $tp,8
|
||||
.set reorder
|
||||
|
||||
daddu $lo0,$alo,$hi0
|
||||
sltu AT,$lo0,$hi0
|
||||
daddu $hi0,$ahi,AT
|
||||
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi1,$nhi,s7
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
|
||||
sd $lo1,($tp)
|
||||
|
||||
daddu $hi1,$hi0
|
||||
sltu AT,$hi1,$hi0
|
||||
sd $hi1,8($tp)
|
||||
sd AT,16($tp)
|
||||
|
||||
li $i,8
|
||||
.align 4
|
||||
.Louter:
|
||||
PTR_ADD $bi,$bp,$i
|
||||
ld $bi,($bi)
|
||||
ld $aj,($ap)
|
||||
ld $alo,8($ap)
|
||||
ld $tj,(sp)
|
||||
|
||||
dmultu $aj,$bi
|
||||
ld $nj,($np)
|
||||
ld $nlo,8($np)
|
||||
mflo $lo0
|
||||
mfhi $hi0
|
||||
daddu $lo0,$tj
|
||||
dmultu $lo0,$n0
|
||||
sltu AT,$lo0,$tj
|
||||
daddu $hi0,AT
|
||||
mflo $m1
|
||||
|
||||
dmultu $alo,$bi
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
dmultu $nj,$m1
|
||||
mflo $lo1
|
||||
mfhi $hi1
|
||||
|
||||
dmultu $nlo,$m1
|
||||
daddu $lo1,$lo0
|
||||
sltu AT,$lo1,$lo0
|
||||
daddu $hi1,AT
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
|
||||
move $tp,sp
|
||||
li $j,16
|
||||
ld $tj,8($tp)
|
||||
.align 4
|
||||
.Linner:
|
||||
.set noreorder
|
||||
PTR_ADD $aj,$ap,$j
|
||||
ld $aj,($aj)
|
||||
PTR_ADD $nj,$np,$j
|
||||
ld $nj,($nj)
|
||||
|
||||
dmultu $aj,$bi
|
||||
daddu $lo0,$alo,$hi0
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo0,$hi0
|
||||
sltu s7,$lo1,$hi1
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $hi1,$nhi,s7
|
||||
mflo $alo
|
||||
mfhi $ahi
|
||||
|
||||
daddu $lo0,$tj
|
||||
addu $j,8
|
||||
dmultu $nj,$m1
|
||||
sltu AT,$lo0,$tj
|
||||
daddu $lo1,$lo0
|
||||
daddu $hi0,AT
|
||||
sltu s7,$lo1,$lo0
|
||||
ld $tj,16($tp)
|
||||
daddu $hi1,s7
|
||||
sltu AT,$j,$num
|
||||
mflo $nlo
|
||||
mfhi $nhi
|
||||
sd $lo1,($tp)
|
||||
bnez AT,.Linner
|
||||
PTR_ADD $tp,8
|
||||
.set reorder
|
||||
|
||||
daddu $lo0,$alo,$hi0
|
||||
sltu AT,$lo0,$hi0
|
||||
daddu $hi0,$ahi,AT
|
||||
daddu $lo0,$tj
|
||||
sltu s7,$lo0,$tj
|
||||
daddu $hi0,s7
|
||||
|
||||
ld $tj,16($tp)
|
||||
daddu $lo1,$nlo,$hi1
|
||||
sltu AT,$lo1,$hi1
|
||||
daddu $hi1,$nhi,AT
|
||||
daddu $lo1,$lo0
|
||||
sltu s7,$lo1,$lo0
|
||||
daddu $hi1,s7
|
||||
sd $lo1,($tp)
|
||||
|
||||
daddu $lo1,$hi1,$hi0
|
||||
sltu $hi1,$lo1,$hi0
|
||||
daddu $lo1,$tj
|
||||
sltu AT,$lo1,$tj
|
||||
daddu $hi1,AT
|
||||
sd $lo1,8($tp)
|
||||
sd $hi1,16($tp)
|
||||
|
||||
addu $i,8
|
||||
sltu s7,$i,$num
|
||||
bnez s7,.Louter
|
||||
|
||||
.set noreorder
|
||||
PTR_ADD $tj,sp,$num # &tp[num]
|
||||
move $tp,sp
|
||||
move $ap,sp
|
||||
li $hi0,0 # clear borrow bit
|
||||
|
||||
.align 4
|
||||
.Lsub: ld $lo0,($tp)
|
||||
ld $lo1,($np)
|
||||
PTR_ADD $tp,8
|
||||
PTR_ADD $np,8
|
||||
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
|
||||
sgtu AT,$lo1,$lo0
|
||||
dsubu $lo0,$lo1,$hi0
|
||||
sgtu $hi0,$lo0,$lo1
|
||||
sd $lo0,($rp)
|
||||
or $hi0,AT
|
||||
sltu AT,$tp,$tj
|
||||
bnez AT,.Lsub
|
||||
PTR_ADD $rp,8
|
||||
|
||||
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
|
||||
move $tp,sp
|
||||
PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ld $aj,($ap)
|
||||
PTR_ADD $ap,8
|
||||
PTR_ADD $tp,8
|
||||
sd zero,-8($tp)
|
||||
sltu AT,$tp,$tj
|
||||
sd $aj,($rp)
|
||||
bnez AT,.Lcopy
|
||||
PTR_ADD $rp,8
|
||||
|
||||
ld s0,0($fp)
|
||||
ld s1,8($fp)
|
||||
ld s2,16($fp)
|
||||
ld s3,24($fp)
|
||||
ld s4,32($fp)
|
||||
ld s5,40($fp)
|
||||
ld s6,48($fp)
|
||||
ld s7,56($fp)
|
||||
li v0,1
|
||||
jr ra
|
||||
PTR_ADD sp,$fp,64
|
||||
.set reorder
|
||||
END(bn_mul_mont)
|
||||
.rdata
|
||||
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
||||
2201
openssl-1.0.2f/crypto/bn/asm/mips3.s
Normal file
2201
openssl-1.0.2f/crypto/bn/asm/mips3.s
Normal file
File diff suppressed because it is too large
Load Diff
1618
openssl-1.0.2f/crypto/bn/asm/pa-risc2.s
Normal file
1618
openssl-1.0.2f/crypto/bn/asm/pa-risc2.s
Normal file
File diff suppressed because it is too large
Load Diff
1605
openssl-1.0.2f/crypto/bn/asm/pa-risc2W.s
Normal file
1605
openssl-1.0.2f/crypto/bn/asm/pa-risc2W.s
Normal file
File diff suppressed because it is too large
Load Diff
995
openssl-1.0.2f/crypto/bn/asm/parisc-mont.pl
Normal file
995
openssl-1.0.2f/crypto/bn/asm/parisc-mont.pl
Normal file
@@ -0,0 +1,995 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# On PA-7100LC this module performs ~90-50% better, less for longer
|
||||
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
|
||||
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
|
||||
# multiplication, which in turn means that "baseline" performance was
|
||||
# optimal in respect to instruction set capabilities. Fair comparison
|
||||
# with vendor compiler is problematic, because OpenSSL doesn't define
|
||||
# BN_LLONG [presumably] for historical reasons, which drives compiler
|
||||
# toward 4 times 16x16=32-bit multiplicatons [plus complementary
|
||||
# shifts and additions] instead. This means that you should observe
|
||||
# several times improvement over code generated by vendor compiler
|
||||
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
|
||||
# improvement coefficient was never collected on PA-7100LC, or any
|
||||
# other 1.1 CPU, because I don't have access to such machine with
|
||||
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
|
||||
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
|
||||
# of ~5x on PA-8600.
|
||||
#
|
||||
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
|
||||
# reportedly ~2x faster than vendor compiler generated code [according
|
||||
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
|
||||
# this implementation is actually 32-bit one, in the sense that it
|
||||
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
|
||||
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
|
||||
# module picks halves of 64-bit values in reverse order and pretends
|
||||
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
|
||||
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
|
||||
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
|
||||
# i.e. there is no "wider" multiplication like on most other 64-bit
|
||||
# platforms. This means that even being effectively 32-bit, this
|
||||
# implementation performs "64-bit" computational task in same amount
|
||||
# of arithmetic operations, most notably multiplications. It requires
|
||||
# more memory references, most notably to tp[num], but this doesn't
|
||||
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
|
||||
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
|
||||
# it's ~10% better for shortest key length and ~10% worse for longest
|
||||
# one.
|
||||
#
|
||||
# In case it wasn't clear. The module has two distinct code paths:
|
||||
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
|
||||
# additions and 64-bit integer loads, not to mention specific
|
||||
# instruction scheduling. In 64-bit build naturally only 2.0 code path
|
||||
# is assembled. In 32-bit application context both code paths are
|
||||
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
|
||||
# is taken automatically. Also, in 32-bit build the module imposes
|
||||
# couple of limitations: vector lengths has to be even and vector
|
||||
# addresses has to be 64-bit aligned. Normally neither is a problem:
|
||||
# most common key lengths are even and vectors are commonly malloc-ed,
|
||||
# which ensures alignment.
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account on
|
||||
# PA-RISC 1.1 machine, and to correspondent who chose to remain
|
||||
# anonymous for testing the code on PA-RISC 2.0 machine.
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
open STDOUT,">$output";
|
||||
|
||||
if ($flavour =~ /64/) {
|
||||
$LEVEL ="2.0W";
|
||||
$SIZE_T =8;
|
||||
$FRAME_MARKER =80;
|
||||
$SAVED_RP =16;
|
||||
$PUSH ="std";
|
||||
$PUSHMA ="std,ma";
|
||||
$POP ="ldd";
|
||||
$POPMB ="ldd,mb";
|
||||
$BN_SZ =$SIZE_T;
|
||||
} else {
|
||||
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
|
||||
$SIZE_T =4;
|
||||
$FRAME_MARKER =48;
|
||||
$SAVED_RP =20;
|
||||
$PUSH ="stw";
|
||||
$PUSHMA ="stwm";
|
||||
$POP ="ldw";
|
||||
$POPMB ="ldwm";
|
||||
$BN_SZ =$SIZE_T;
|
||||
if (open CONF,"<${dir}../../opensslconf.h") {
|
||||
while(<CONF>) {
|
||||
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
|
||||
$BN_SZ=8;
|
||||
$LEVEL="2.0";
|
||||
last;
|
||||
}
|
||||
}
|
||||
close CONF;
|
||||
}
|
||||
}
|
||||
|
||||
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
$LOCALS=$FRAME-$FRAME_MARKER;
|
||||
$FRAME+=32; # local variables
|
||||
|
||||
$tp="%r31";
|
||||
$ti1="%r29";
|
||||
$ti0="%r28";
|
||||
|
||||
$rp="%r26";
|
||||
$ap="%r25";
|
||||
$bp="%r24";
|
||||
$np="%r23";
|
||||
$n0="%r22"; # passed through stack in 32-bit
|
||||
$num="%r21"; # passed through stack in 32-bit
|
||||
$idx="%r20";
|
||||
$arrsz="%r19";
|
||||
|
||||
$nm1="%r7";
|
||||
$nm0="%r6";
|
||||
$ab1="%r5";
|
||||
$ab0="%r4";
|
||||
|
||||
$fp="%r3";
|
||||
$hi1="%r2";
|
||||
$hi0="%r1";
|
||||
|
||||
$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s
|
||||
|
||||
$fm0="%fr4"; $fti=$fm0;
|
||||
$fbi="%fr5L";
|
||||
$fn0="%fr5R";
|
||||
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
|
||||
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
||||
.ALIGN 64
|
||||
bn_mul_mont
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
ldo -$FRAME(%sp),$fp
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldw `-$FRAME_MARKER-4`($fp),$n0
|
||||
ldw `-$FRAME_MARKER-8`($fp),$num
|
||||
nop
|
||||
nop ; alignment
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
comiclr,<= 6,$num,%r0 ; are vectors long enough?
|
||||
b L\$abort
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
add,ev %r0,$num,$num ; is $num even?
|
||||
b L\$abort
|
||||
nop
|
||||
or $ap,$np,$ti1
|
||||
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
|
||||
b L\$abort
|
||||
nop
|
||||
nop ; alignment
|
||||
nop
|
||||
|
||||
fldws 0($n0),${fn0}
|
||||
fldws,ma 4($bp),${fbi} ; bp[0]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
comib,> 3,$num,L\$abort ; are vectors long enough?
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
addl $num,$num,$num ; I operate on 32-bit values
|
||||
|
||||
fldws 4($n0),${fn0} ; only low part of n0
|
||||
fldws 4($bp),${fbi} ; bp[0] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
fldds 0($ap),${fai} ; ap[0,1]
|
||||
fldds 0($np),${fni} ; np[0,1]
|
||||
|
||||
sh2addl $num,%r0,$arrsz
|
||||
ldi 31,$hi0
|
||||
ldo 36($arrsz),$hi1 ; space for tp[num+1]
|
||||
andcm $hi1,$hi0,$hi1 ; align
|
||||
addl $hi1,%sp,%sp
|
||||
$PUSH $fp,-$SIZE_T(%sp)
|
||||
|
||||
ldo `$LOCALS+16`($fp),$xfer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
addl $arrsz,$ap,$ap ; point at the end
|
||||
addl $arrsz,$np,$np
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
fstds ${fab1},0($xfer)
|
||||
fstds ${fnm1},8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
mtctl $hi0,%cr11 ; $hi0 still holds 31
|
||||
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
|
||||
b L\$parisc11
|
||||
nop
|
||||
___
|
||||
$code.=<<___; # PA-RISC 2.0 code-path
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldo 8($idx),$idx ; j++++
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
L\$1st
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
fldws 0($bp),${fbi} ; bp[1] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
addl $hi1,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2]
|
||||
flddx $idx($np),${fni} ; np[2]
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldd -16($xfer),$ab0 ; 33-bit value
|
||||
ldd -8($xfer),$nm0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
extrd,u $ab0,31,32,$ti0 ; carry bit
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $ti0,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
L\$inner
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ti0,$ti0
|
||||
addl $ti0,$ab0,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
addl $ti0,$ab0,$ab0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldi 12,$ti0 ; bp[i] in flipped word order
|
||||
addl,ev %r0,$num,$num
|
||||
ldi -4,$ti0
|
||||
addl $ti0,$bp,$bp
|
||||
fldws 0($bp),${fbi}
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
addl $hi1,$hi0,$hi0
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone
|
||||
addl $hi0,$ab1,$ab1
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
addl $hi1,$hi0,$hi0
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
ldws,ma 4($tp),$ti0
|
||||
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
|
||||
b L\$sub_pa11
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldd,ma 8($tp),$ti0
|
||||
L\$sub
|
||||
ldd $idx($np),$hi0
|
||||
shrpd $ti0,$ti0,32,$ti0 ; flip word order
|
||||
std $ti0,-8($tp) ; save flipped value
|
||||
sub,db $ti0,$hi0,$hi1
|
||||
ldd,ma 8($tp),$ti0
|
||||
addib,<> 8,$idx,L\$sub
|
||||
std,ma $hi1,8($rp)
|
||||
|
||||
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
|
||||
sub,db $ti0,%r0,$hi1
|
||||
ldo -8($tp),$tp
|
||||
___
|
||||
$code.=<<___;
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy
|
||||
ldd $idx($np),$hi0
|
||||
std,ma %r0,8($tp)
|
||||
addib,<> 8,$idx,.-8 ; L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
___
|
||||
|
||||
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
||||
$ablo=$ab0;
|
||||
$abhi=$ab1;
|
||||
$nmlo0=$nm0;
|
||||
$nmhi0=$nm1;
|
||||
$nmlo1="%r9";
|
||||
$nmhi1="%r8";
|
||||
|
||||
$code.=<<___;
|
||||
b L\$done
|
||||
nop
|
||||
|
||||
.ALIGN 8
|
||||
L\$parisc11
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -16($xfer),$hi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw -8($xfer),$nmhi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
addc %r0,$nmhi0,$hi1
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
nop
|
||||
|
||||
L\$1st_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 12($xfer),$nmlo1
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -16($xfer),$abhi
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
ldw -4($xfer),$nmlo0
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
ldw 8($xfer),$nmhi1
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
add $hi0,$ablo,$ablo
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -4($xfer),$nmlo0
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer_pa11
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
ldw -16($xfer),$abhi ; carry bit actually
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $abhi,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
addc %r0,$nmhi0,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
|
||||
L\$inner_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
ldw 12($xfer),$nmlo1
|
||||
add $ti1,$ablo,$ablo
|
||||
ldw 8($xfer),$nmhi1
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -4($xfer),$nmlo0
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$abhi,$abhi
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
add $ti0,$ablo,$ablo
|
||||
fstds ${fab0},-16($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ti1,$ablo,$ablo
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw -16($xfer),$abhi
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -4($xfer),$nmlo0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti0,$ablo,$ablo
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone_pa11; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer_pa11
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone_pa11
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
ldw -4($tp),$ti0
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub_pa11
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
stws,ma %r0,4($tp)
|
||||
addib,<> 4,$idx,L\$copy_pa11
|
||||
stws,ma $hi0,4($rp)
|
||||
|
||||
nop ; alignment
|
||||
L\$done
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
ldi 1,%r28 ; signal "handled"
|
||||
ldo $FRAME($fp),%sp ; destroy tp[num+1]
|
||||
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
L\$abort
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
||||
# that it can be compiled with .LEVEL 1.0. It should be noted that I
|
||||
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
||||
# directive...
|
||||
|
||||
my $ldd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "ldd$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
|
||||
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
|
||||
$opcode|=(1<<5) if ($mod =~ /^,m/);
|
||||
$opcode|=(1<<13) if ($mod =~ /^,mb/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $std = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "std$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
|
||||
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
|
||||
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
|
||||
$opcode|=(1<<5) if ($mod =~ /^,m/);
|
||||
$opcode|=(1<<13) if ($mod =~ /^,mb/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $extrd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "extrd$mod\t$args";
|
||||
|
||||
# I only have ",u" completer, it's implicitly encoded...
|
||||
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
|
||||
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
|
||||
my $len=32-$3;
|
||||
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
|
||||
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
|
||||
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
|
||||
my $len=32-$2;
|
||||
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
|
||||
$opcode |= (1<<13) if ($mod =~ /,\**=/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $shrpd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "shrpd$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
|
||||
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
|
||||
my $cpos=63-$3;
|
||||
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $sub = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "sub$mod\t$args";
|
||||
|
||||
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
|
||||
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
|
||||
$opcode|=(1<<10); # e1
|
||||
$opcode|=(1<<8); # e2
|
||||
$opcode|=(1<<5); # d
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
sub assemble {
|
||||
my ($mnemonic,$mod,$args)=@_;
|
||||
my $opcode = eval("\$$mnemonic");
|
||||
|
||||
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
# flip word order in 64-bit mode...
|
||||
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
|
||||
# assemble 2.0 instructions in 32-bit mode...
|
||||
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
|
||||
|
||||
s/\bbv\b/bve/gm if ($SIZE_T==8);
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT;
|
||||
335
openssl-1.0.2f/crypto/bn/asm/ppc-mont.pl
Normal file
335
openssl-1.0.2f/crypto/bn/asm/ppc-mont.pl
Normal file
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# April 2006
|
||||
|
||||
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
|
||||
# to gain a bit more by modulo-scheduling outer loop, then dedicated
|
||||
# squaring procedure should give further 20% and code can be adapted
|
||||
# for 32-bit application running on 64-bit CPU. As for the latter.
|
||||
# It won't be able to achieve "native" 64-bit performance, because in
|
||||
# 32-bit application context every addc instruction will have to be
|
||||
# expanded as addc, twice right shift by 32 and finally adde, etc.
|
||||
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
|
||||
# for 64-bit application running on PPC970/G5 is:
|
||||
#
|
||||
# 512-bit +65%
|
||||
# 1024-bit +35%
|
||||
# 2048-bit +18%
|
||||
# 4096-bit +4%
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /32/) {
|
||||
$BITS= 32;
|
||||
$BNSZ= $BITS/8;
|
||||
$SIZE_T=4;
|
||||
$RZONE= 224;
|
||||
|
||||
$LD= "lwz"; # load
|
||||
$LDU= "lwzu"; # load and update
|
||||
$LDX= "lwzx"; # load indexed
|
||||
$ST= "stw"; # store
|
||||
$STU= "stwu"; # store and update
|
||||
$STX= "stwx"; # store indexed
|
||||
$STUX= "stwux"; # store indexed and update
|
||||
$UMULL= "mullw"; # unsigned multiply low
|
||||
$UMULH= "mulhwu"; # unsigned multiply high
|
||||
$UCMP= "cmplw"; # unsigned compare
|
||||
$SHRI= "srwi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} elsif ($flavour =~ /64/) {
|
||||
$BITS= 64;
|
||||
$BNSZ= $BITS/8;
|
||||
$SIZE_T=8;
|
||||
$RZONE= 288;
|
||||
|
||||
# same as above, but 64-bit mnemonics...
|
||||
$LD= "ld"; # load
|
||||
$LDU= "ldu"; # load and update
|
||||
$LDX= "ldx"; # load indexed
|
||||
$ST= "std"; # store
|
||||
$STU= "stdu"; # store and update
|
||||
$STX= "stdx"; # store indexed
|
||||
$STUX= "stdux"; # store indexed and update
|
||||
$UMULL= "mulld"; # unsigned multiply low
|
||||
$UMULH= "mulhdu"; # unsigned multiply high
|
||||
$UCMP= "cmpld"; # unsigned compare
|
||||
$SHRI= "srdi"; # unsigned shift right by immediate
|
||||
$PUSH= $ST;
|
||||
$POP= $LD;
|
||||
} else { die "nonsense $flavour"; }
|
||||
|
||||
$FRAME=8*$SIZE_T+$RZONE;
|
||||
$LOCALS=8*$SIZE_T;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
||||
die "can't locate ppc-xlate.pl";
|
||||
|
||||
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
|
||||
|
||||
$sp="r1";
|
||||
$toc="r2";
|
||||
$rp="r3"; $ovf="r3";
|
||||
$ap="r4";
|
||||
$bp="r5";
|
||||
$np="r6";
|
||||
$n0="r7";
|
||||
$num="r8";
|
||||
$rp="r9"; # $rp is reassigned
|
||||
$aj="r10";
|
||||
$nj="r11";
|
||||
$tj="r12";
|
||||
# non-volatile registers
|
||||
$i="r20";
|
||||
$j="r21";
|
||||
$tp="r22";
|
||||
$m0="r23";
|
||||
$m1="r24";
|
||||
$lo0="r25";
|
||||
$hi0="r26";
|
||||
$lo1="r27";
|
||||
$hi1="r28";
|
||||
$alo="r29";
|
||||
$ahi="r30";
|
||||
$nlo="r31";
|
||||
#
|
||||
$nhi="r0";
|
||||
|
||||
$code=<<___;
|
||||
.machine "any"
|
||||
.text
|
||||
|
||||
.globl .bn_mul_mont_int
|
||||
.align 4
|
||||
.bn_mul_mont_int:
|
||||
cmpwi $num,4
|
||||
mr $rp,r3 ; $rp is reassigned
|
||||
li r3,0
|
||||
bltlr
|
||||
___
|
||||
$code.=<<___ if ($BNSZ==4);
|
||||
cmpwi $num,32 ; longer key performance is not better
|
||||
bgelr
|
||||
___
|
||||
$code.=<<___;
|
||||
slwi $num,$num,`log($BNSZ)/log(2)`
|
||||
li $tj,-4096
|
||||
addi $ovf,$num,$FRAME
|
||||
subf $ovf,$ovf,$sp ; $sp-$ovf
|
||||
and $ovf,$ovf,$tj ; minimize TLB usage
|
||||
subf $ovf,$sp,$ovf ; $ovf-$sp
|
||||
mr $tj,$sp
|
||||
srwi $num,$num,`log($BNSZ)/log(2)`
|
||||
$STUX $sp,$sp,$ovf
|
||||
|
||||
$PUSH r20,`-12*$SIZE_T`($tj)
|
||||
$PUSH r21,`-11*$SIZE_T`($tj)
|
||||
$PUSH r22,`-10*$SIZE_T`($tj)
|
||||
$PUSH r23,`-9*$SIZE_T`($tj)
|
||||
$PUSH r24,`-8*$SIZE_T`($tj)
|
||||
$PUSH r25,`-7*$SIZE_T`($tj)
|
||||
$PUSH r26,`-6*$SIZE_T`($tj)
|
||||
$PUSH r27,`-5*$SIZE_T`($tj)
|
||||
$PUSH r28,`-4*$SIZE_T`($tj)
|
||||
$PUSH r29,`-3*$SIZE_T`($tj)
|
||||
$PUSH r30,`-2*$SIZE_T`($tj)
|
||||
$PUSH r31,`-1*$SIZE_T`($tj)
|
||||
|
||||
$LD $n0,0($n0) ; pull n0[0] value
|
||||
addi $num,$num,-2 ; adjust $num for counter register
|
||||
|
||||
$LD $m0,0($bp) ; m0=bp[0]
|
||||
$LD $aj,0($ap) ; ap[0]
|
||||
addi $tp,$sp,$LOCALS
|
||||
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
|
||||
$UMULH $hi0,$aj,$m0
|
||||
|
||||
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||
$LD $nj,0($np) ; np[0]
|
||||
|
||||
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
|
||||
|
||||
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
|
||||
$UMULH $ahi,$aj,$m0
|
||||
|
||||
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||
$UMULH $hi1,$nj,$m1
|
||||
$LD $nj,$BNSZ($np) ; np[1]
|
||||
addc $lo1,$lo1,$lo0
|
||||
addze $hi1,$hi1
|
||||
|
||||
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||
$UMULH $nhi,$nj,$m1
|
||||
|
||||
mtctr $num
|
||||
li $j,`2*$BNSZ`
|
||||
.align 4
|
||||
L1st:
|
||||
$LDX $aj,$ap,$j ; ap[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
$LDX $nj,$np,$j ; np[j]
|
||||
addze $hi0,$ahi
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
|
||||
addc $lo1,$nlo,$hi1
|
||||
$UMULH $ahi,$aj,$m0
|
||||
addze $hi1,$nhi
|
||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||
$UMULH $nhi,$nj,$m1
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
addi $j,$j,$BNSZ ; j++
|
||||
addi $tp,$tp,$BNSZ ; tp++
|
||||
bdnz- L1st
|
||||
;L1st
|
||||
addc $lo0,$alo,$hi0
|
||||
addze $hi0,$ahi
|
||||
|
||||
addc $lo1,$nlo,$hi1
|
||||
addze $hi1,$nhi
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
li $ovf,0
|
||||
addc $hi1,$hi1,$hi0
|
||||
addze $ovf,$ovf ; upmost overflow bit
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
|
||||
li $i,$BNSZ
|
||||
.align 4
|
||||
Louter:
|
||||
$LDX $m0,$bp,$i ; m0=bp[i]
|
||||
$LD $aj,0($ap) ; ap[0]
|
||||
addi $tp,$sp,$LOCALS
|
||||
$LD $tj,$LOCALS($sp); tp[0]
|
||||
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
|
||||
$UMULH $hi0,$aj,$m0
|
||||
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||
$LD $nj,0($np) ; np[0]
|
||||
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||
addze $hi0,$hi0
|
||||
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
|
||||
$UMULH $ahi,$aj,$m0
|
||||
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||
$UMULH $hi1,$nj,$m1
|
||||
$LD $nj,$BNSZ($np) ; np[1]
|
||||
addc $lo1,$lo1,$lo0
|
||||
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||
addze $hi1,$hi1
|
||||
$UMULH $nhi,$nj,$m1
|
||||
|
||||
mtctr $num
|
||||
li $j,`2*$BNSZ`
|
||||
.align 4
|
||||
Linner:
|
||||
$LDX $aj,$ap,$j ; ap[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||
addze $hi0,$ahi
|
||||
$LDX $nj,$np,$j ; np[j]
|
||||
addc $lo1,$nlo,$hi1
|
||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||
addze $hi1,$nhi
|
||||
$UMULH $ahi,$aj,$m0
|
||||
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||
addze $hi0,$hi0
|
||||
$UMULH $nhi,$nj,$m1
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addi $j,$j,$BNSZ ; j++
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
addi $tp,$tp,$BNSZ ; tp++
|
||||
bdnz- Linner
|
||||
;Linner
|
||||
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||
addc $lo0,$alo,$hi0
|
||||
addze $hi0,$ahi
|
||||
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||
addze $hi0,$hi0
|
||||
|
||||
addc $lo1,$nlo,$hi1
|
||||
addze $hi1,$nhi
|
||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addze $hi1,$hi1
|
||||
$ST $lo1,0($tp) ; tp[j-1]
|
||||
|
||||
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
|
||||
li $ovf,0
|
||||
adde $hi1,$hi1,$hi0
|
||||
addze $ovf,$ovf
|
||||
$ST $hi1,$BNSZ($tp)
|
||||
;
|
||||
slwi $tj,$num,`log($BNSZ)/log(2)`
|
||||
$UCMP $i,$tj
|
||||
addi $i,$i,$BNSZ
|
||||
ble- Louter
|
||||
|
||||
addi $num,$num,2 ; restore $num
|
||||
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
|
||||
addi $tp,$sp,$LOCALS
|
||||
mtctr $num
|
||||
|
||||
.align 4
|
||||
Lsub: $LDX $tj,$tp,$j
|
||||
$LDX $nj,$np,$j
|
||||
subfe $aj,$nj,$tj ; tp[j]-np[j]
|
||||
$STX $aj,$rp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lsub
|
||||
|
||||
li $j,0
|
||||
mtctr $num
|
||||
subfe $ovf,$j,$ovf ; handle upmost overflow bit
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
$LDX $tj,$ap,$j
|
||||
$STX $tj,$rp,$j
|
||||
$STX $j,$tp,$j ; zap at once
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz- Lcopy
|
||||
|
||||
$POP $tj,0($sp)
|
||||
li r3,1
|
||||
$POP r20,`-12*$SIZE_T`($tj)
|
||||
$POP r21,`-11*$SIZE_T`($tj)
|
||||
$POP r22,`-10*$SIZE_T`($tj)
|
||||
$POP r23,`-9*$SIZE_T`($tj)
|
||||
$POP r24,`-8*$SIZE_T`($tj)
|
||||
$POP r25,`-7*$SIZE_T`($tj)
|
||||
$POP r26,`-6*$SIZE_T`($tj)
|
||||
$POP r27,`-5*$SIZE_T`($tj)
|
||||
$POP r28,`-4*$SIZE_T`($tj)
|
||||
$POP r29,`-3*$SIZE_T`($tj)
|
||||
$POP r30,`-2*$SIZE_T`($tj)
|
||||
$POP r31,`-1*$SIZE_T`($tj)
|
||||
mr $sp,$tj
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,4,0,0x80,12,6,0
|
||||
.long 0
|
||||
.size .bn_mul_mont_int,.-.bn_mul_mont_int
|
||||
|
||||
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
2008
openssl-1.0.2f/crypto/bn/asm/ppc.pl
Normal file
2008
openssl-1.0.2f/crypto/bn/asm/ppc.pl
Normal file
File diff suppressed because it is too large
Load Diff
1628
openssl-1.0.2f/crypto/bn/asm/ppc64-mont.pl
Normal file
1628
openssl-1.0.2f/crypto/bn/asm/ppc64-mont.pl
Normal file
File diff suppressed because it is too large
Load Diff
1898
openssl-1.0.2f/crypto/bn/asm/rsaz-avx2.pl
Executable file
1898
openssl-1.0.2f/crypto/bn/asm/rsaz-avx2.pl
Executable file
File diff suppressed because it is too large
Load Diff
2144
openssl-1.0.2f/crypto/bn/asm/rsaz-x86_64.pl
Executable file
2144
openssl-1.0.2f/crypto/bn/asm/rsaz-x86_64.pl
Executable file
File diff suppressed because it is too large
Load Diff
221
openssl-1.0.2f/crypto/bn/asm/s390x-gf2m.pl
Normal file
221
openssl-1.0.2f/crypto/bn/asm/s390x-gf2m.pl
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... gcc 4.3 appeared to generate poor code, therefore
|
||||
# the effort. And indeed, the module delivers 55%-90%(*) improvement
|
||||
# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
|
||||
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
|
||||
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
|
||||
# even higher, for example on z990 it was measured 80%-150%. ECDSA
|
||||
# sign is modest 9%-12% faster. Keep in mind that these coefficients
|
||||
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
|
||||
# burnt in it...
|
||||
#
|
||||
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
|
||||
# so that improvement coefficients can vary from one specific
|
||||
# setup to another.
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /3[12]/) {
|
||||
$SIZE_T=4;
|
||||
$g="";
|
||||
} else {
|
||||
$SIZE_T=8;
|
||||
$g="g";
|
||||
}
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$stdframe=16*$SIZE_T+4*8;
|
||||
|
||||
$rp="%r2";
|
||||
$a1="%r3";
|
||||
$a0="%r4";
|
||||
$b1="%r5";
|
||||
$b0="%r6";
|
||||
|
||||
$ra="%r14";
|
||||
$sp="%r15";
|
||||
|
||||
@T=("%r0","%r1");
|
||||
@i=("%r12","%r13");
|
||||
|
||||
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
|
||||
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.type _mul_1x1,\@function
|
||||
.align 16
|
||||
_mul_1x1:
|
||||
lgr $a1,$a
|
||||
sllg $a2,$a,1
|
||||
sllg $a4,$a,2
|
||||
sllg $a8,$a,3
|
||||
|
||||
srag $lo,$a1,63 # broadcast 63rd bit
|
||||
nihh $a1,0x1fff
|
||||
srag @i[0],$a2,63 # broadcast 62nd bit
|
||||
nihh $a2,0x3fff
|
||||
srag @i[1],$a4,63 # broadcast 61st bit
|
||||
nihh $a4,0x7fff
|
||||
ngr $lo,$b
|
||||
ngr @i[0],$b
|
||||
ngr @i[1],$b
|
||||
|
||||
lghi @T[0],0
|
||||
lgr $a12,$a1
|
||||
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
|
||||
xgr $a12,$a2
|
||||
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
|
||||
lgr $a48,$a4
|
||||
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
|
||||
xgr $a48,$a8
|
||||
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
|
||||
xgr $a1,$a4
|
||||
|
||||
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
|
||||
xgr $a2,$a4
|
||||
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
|
||||
xgr $a12,$a4
|
||||
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
|
||||
xgr $a1,$a48
|
||||
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
|
||||
xgr $a2,$a48
|
||||
|
||||
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
|
||||
xgr $a12,$a48
|
||||
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
|
||||
xgr $a1,$a4
|
||||
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
|
||||
xgr $a2,$a4
|
||||
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
|
||||
|
||||
xgr $a12,$a4
|
||||
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
|
||||
srlg $hi,$lo,1
|
||||
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
|
||||
sllg $lo,$lo,63
|
||||
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
|
||||
srlg @T[0],@i[0],2
|
||||
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
|
||||
|
||||
lghi $mask,`0xf<<3`
|
||||
sllg $a1,@i[0],62
|
||||
sllg @i[0],$b,3
|
||||
srlg @T[1],@i[1],3
|
||||
ngr @i[0],$mask
|
||||
sllg $a2,@i[1],61
|
||||
srlg @i[1],$b,4-3
|
||||
xgr $hi,@T[0]
|
||||
ngr @i[1],$mask
|
||||
xgr $lo,$a1
|
||||
xgr $hi,@T[1]
|
||||
xgr $lo,$a2
|
||||
|
||||
xg $lo,$stdframe(@i[0],$sp)
|
||||
srlg @i[0],$b,8-3
|
||||
ngr @i[0],$mask
|
||||
___
|
||||
for($n=1;$n<14;$n++) {
|
||||
$code.=<<___;
|
||||
lg @T[1],$stdframe(@i[1],$sp)
|
||||
srlg @i[1],$b,`($n+2)*4`-3
|
||||
sllg @T[0],@T[1],`$n*4`
|
||||
ngr @i[1],$mask
|
||||
srlg @T[1],@T[1],`64-$n*4`
|
||||
xgr $lo,@T[0]
|
||||
xgr $hi,@T[1]
|
||||
___
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
$code.=<<___;
|
||||
lg @T[1],$stdframe(@i[1],$sp)
|
||||
sllg @T[0],@T[1],`$n*4`
|
||||
srlg @T[1],@T[1],`64-$n*4`
|
||||
xgr $lo,@T[0]
|
||||
xgr $hi,@T[1]
|
||||
|
||||
lg @T[0],$stdframe(@i[0],$sp)
|
||||
sllg @T[1],@T[0],`($n+1)*4`
|
||||
srlg @T[0],@T[0],`64-($n+1)*4`
|
||||
xgr $lo,@T[1]
|
||||
xgr $hi,@T[0]
|
||||
|
||||
br $ra
|
||||
.size _mul_1x1,.-_mul_1x1
|
||||
|
||||
.globl bn_GF2m_mul_2x2
|
||||
.type bn_GF2m_mul_2x2,\@function
|
||||
.align 16
|
||||
bn_GF2m_mul_2x2:
|
||||
stm${g} %r3,%r15,3*$SIZE_T($sp)
|
||||
|
||||
lghi %r1,-$stdframe-128
|
||||
la %r0,0($sp)
|
||||
la $sp,0(%r1,$sp) # alloca
|
||||
st${g} %r0,0($sp) # back chain
|
||||
___
|
||||
if ($SIZE_T==8) {
|
||||
my @r=map("%r$_",(6..9));
|
||||
$code.=<<___;
|
||||
bras $ra,_mul_1x1 # a1·b1
|
||||
stmg $lo,$hi,16($rp)
|
||||
|
||||
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
|
||||
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
bras $ra,_mul_1x1 # a0·b0
|
||||
stmg $lo,$hi,0($rp)
|
||||
|
||||
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
|
||||
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
|
||||
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
|
||||
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
|
||||
lmg @r[0],@r[3],0($rp)
|
||||
|
||||
xgr $lo,$hi
|
||||
xgr $hi,@r[1]
|
||||
xgr $lo,@r[0]
|
||||
xgr $hi,@r[2]
|
||||
xgr $lo,@r[3]
|
||||
xgr $hi,@r[3]
|
||||
xgr $lo,$hi
|
||||
stg $hi,16($rp)
|
||||
stg $lo,8($rp)
|
||||
___
|
||||
} else {
|
||||
$code.=<<___;
|
||||
sllg %r3,%r3,32
|
||||
sllg %r5,%r5,32
|
||||
or %r3,%r4
|
||||
or %r5,%r6
|
||||
bras $ra,_mul_1x1
|
||||
rllg $lo,$lo,32
|
||||
rllg $hi,$hi,32
|
||||
stmg $lo,$hi,0($rp)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
|
||||
br $ra
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
277
openssl-1.0.2f/crypto/bn/asm/s390x-mont.pl
Normal file
277
openssl-1.0.2f/crypto/bn/asm/s390x-mont.pl
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# April 2007.
|
||||
#
|
||||
# Performance improvement over vanilla C code varies from 85% to 45%
|
||||
# depending on key length and benchmark. Unfortunately in this context
|
||||
# these are not very impressive results [for code that utilizes "wide"
|
||||
# 64x64=128-bit multiplication, which is not commonly available to C
|
||||
# programmers], at least hand-coded bn_asm.c replacement is known to
|
||||
# provide 30-40% better results for longest keys. Well, on a second
|
||||
# thought it's not very surprising, because z-CPUs are single-issue
|
||||
# and _strictly_ in-order execution, while bn_mul_mont is more or less
|
||||
# dependent on CPU ability to pipe-line instructions and have several
|
||||
# of them "in-flight" at the same time. I mean while other methods,
|
||||
# for example Karatsuba, aim to minimize amount of multiplications at
|
||||
# the cost of other operations increase, bn_mul_mont aim to neatly
|
||||
# "overlap" multiplications and the other operations [and on most
|
||||
# platforms even minimize the amount of the other operations, in
|
||||
# particular references to memory]. But it's possible to improve this
|
||||
# module performance by implementing dedicated squaring code-path and
|
||||
# possibly by unrolling loops...
|
||||
|
||||
# January 2009.
|
||||
#
|
||||
# Reschedule to minimize/avoid Address Generation Interlock hazard,
|
||||
# make inner loops counter-based.
|
||||
|
||||
# November 2010.
|
||||
#
|
||||
# Adapt for -m31 build. If kernel supports what's called "highgprs"
|
||||
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
|
||||
# instructions and achieve "64-bit" performance even in 31-bit legacy
|
||||
# application context. The feature is not specific to any particular
|
||||
# processor, as long as it's "z-CPU". Latter implies that the code
|
||||
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
|
||||
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
|
||||
# On z990 it was measured to perform 2.6-2.2 times better than
|
||||
# compiler-generated code, less for longer keys...
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /3[12]/) {
|
||||
$SIZE_T=4;
|
||||
$g="";
|
||||
} else {
|
||||
$SIZE_T=8;
|
||||
$g="g";
|
||||
}
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$stdframe=16*$SIZE_T+4*8;
|
||||
|
||||
$mn0="%r0";
|
||||
$num="%r1";
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="%r2"; # BN_ULONG *rp,
|
||||
$ap="%r3"; # const BN_ULONG *ap,
|
||||
$bp="%r4"; # const BN_ULONG *bp,
|
||||
$np="%r5"; # const BN_ULONG *np,
|
||||
$n0="%r6"; # const BN_ULONG *n0,
|
||||
#$num="160(%r15)" # int num);
|
||||
|
||||
$bi="%r2"; # zaps rp
|
||||
$j="%r7";
|
||||
|
||||
$ahi="%r8";
|
||||
$alo="%r9";
|
||||
$nhi="%r10";
|
||||
$nlo="%r11";
|
||||
$AHI="%r12";
|
||||
$NHI="%r13";
|
||||
$count="%r14";
|
||||
$sp="%r15";
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
.globl bn_mul_mont
|
||||
.type bn_mul_mont,\@function
|
||||
bn_mul_mont:
|
||||
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
|
||||
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
|
||||
la $bp,0($num,$bp)
|
||||
|
||||
st${g} %r2,2*$SIZE_T($sp)
|
||||
|
||||
cghi $num,16 #
|
||||
lghi %r2,0 #
|
||||
blr %r14 # if($num<16) return 0;
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /3[12]/);
|
||||
tmll $num,4
|
||||
bnzr %r14 # if ($num&1) return 0;
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /3[12]/);
|
||||
cghi $num,96 #
|
||||
bhr %r14 # if($num>96) return 0;
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r3,%r15,3*$SIZE_T($sp)
|
||||
|
||||
lghi $rp,-$stdframe-8 # leave room for carry bit
|
||||
lcgr $j,$num # -$num
|
||||
lgr %r0,$sp
|
||||
la $rp,0($rp,$sp)
|
||||
la $sp,0($j,$rp) # alloca
|
||||
st${g} %r0,0($sp) # back chain
|
||||
|
||||
sra $num,3 # restore $num
|
||||
la $bp,0($j,$bp) # restore $bp
|
||||
ahi $num,-1 # adjust $num for inner loop
|
||||
lg $n0,0($n0) # pull n0
|
||||
_dswap $n0
|
||||
|
||||
lg $bi,0($bp)
|
||||
_dswap $bi
|
||||
lg $alo,0($ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[0]*bp[0]
|
||||
lgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo # "tp[0]"*n0
|
||||
msgr $mn0,$n0
|
||||
|
||||
lg $nlo,0($np) #
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.L1st:
|
||||
lg $alo,0($j,$ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[j]*bp[0]
|
||||
algr $alo,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[j]*m1
|
||||
algr $nlo,$NHI
|
||||
lghi $NHI,0
|
||||
alcgr $nhi,$NHI # +="tp[j]"
|
||||
algr $nlo,$alo
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.L1st
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI # upmost overflow bit
|
||||
stg $NHI,$stdframe-8($j,$sp)
|
||||
stg $AHI,$stdframe($j,$sp)
|
||||
la $bp,8($bp) # bp++
|
||||
|
||||
.Louter:
|
||||
lg $bi,0($bp) # bp[i]
|
||||
_dswap $bi
|
||||
lg $alo,0($ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[0]*bp[i]
|
||||
alg $alo,$stdframe($sp) # +=tp[0]
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lgr $mn0,$alo
|
||||
msgr $mn0,$n0 # tp[0]*n0
|
||||
|
||||
lg $nlo,0($np) # np[0]
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[0]*m1
|
||||
algr $nlo,$alo # +="tp[0]"
|
||||
lghi $NHI,0
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
la $j,8(%r0) # j=1
|
||||
lr $count,$num
|
||||
|
||||
.align 16
|
||||
.Linner:
|
||||
lg $alo,0($j,$ap)
|
||||
_dswap $alo
|
||||
mlgr $ahi,$bi # ap[j]*bp[i]
|
||||
algr $alo,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $ahi,$AHI
|
||||
alg $alo,$stdframe($j,$sp)# +=tp[j]
|
||||
alcgr $AHI,$ahi
|
||||
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
mlgr $nhi,$mn0 # np[j]*m1
|
||||
algr $nlo,$NHI
|
||||
lghi $NHI,0
|
||||
alcgr $nhi,$NHI
|
||||
algr $nlo,$alo # +="tp[j]"
|
||||
alcgr $NHI,$nhi
|
||||
|
||||
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
|
||||
la $j,8($j) # j++
|
||||
brct $count,.Linner
|
||||
|
||||
algr $NHI,$AHI
|
||||
lghi $AHI,0
|
||||
alcgr $AHI,$AHI
|
||||
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
|
||||
lghi $ahi,0
|
||||
alcgr $AHI,$ahi # new upmost overflow bit
|
||||
stg $NHI,$stdframe-8($j,$sp)
|
||||
stg $AHI,$stdframe($j,$sp)
|
||||
|
||||
la $bp,8($bp) # bp++
|
||||
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
|
||||
jne .Louter
|
||||
|
||||
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
|
||||
la $ap,$stdframe($sp)
|
||||
ahi $num,1 # restore $num, incidentally clears "borrow"
|
||||
|
||||
la $j,0(%r0)
|
||||
lr $count,$num
|
||||
.Lsub: lg $alo,0($j,$ap)
|
||||
lg $nlo,0($j,$np)
|
||||
_dswap $nlo
|
||||
slbgr $alo,$nlo
|
||||
stg $alo,0($j,$rp)
|
||||
la $j,8($j)
|
||||
brct $count,.Lsub
|
||||
lghi $ahi,0
|
||||
slbgr $AHI,$ahi # handle upmost carry
|
||||
|
||||
ngr $ap,$AHI
|
||||
lghi $np,-1
|
||||
xgr $np,$AHI
|
||||
ngr $np,$rp
|
||||
ogr $ap,$np # ap=borrow?tp:rp
|
||||
|
||||
la $j,0(%r0)
|
||||
lgr $count,$num
|
||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||
_dswap $alo
|
||||
stg $j,$stdframe($j,$sp) # zap tp
|
||||
stg $alo,0($j,$rp)
|
||||
la $j,8($j)
|
||||
brct $count,.Lcopy
|
||||
|
||||
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
|
||||
lm${g} %r6,%r15,0(%r1)
|
||||
lghi %r2,1 # signal "processed"
|
||||
br %r14
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT;
|
||||
713
openssl-1.0.2f/crypto/bn/asm/s390x.S
Executable file
713
openssl-1.0.2f/crypto/bn/asm/s390x.S
Executable file
@@ -0,0 +1,713 @@
|
||||
.ident "s390x.S, version 1.1"
|
||||
// ====================================================================
|
||||
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
// project.
|
||||
//
|
||||
// Rights for redistribution and usage in source and binary forms are
|
||||
// granted according to the OpenSSL license. Warranty of any kind is
|
||||
// disclaimed.
|
||||
// ====================================================================
|
||||
|
||||
.text
|
||||
|
||||
#define zero %r0
|
||||
|
||||
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
|
||||
.globl bn_mul_add_words
|
||||
.type bn_mul_add_words,@function
|
||||
.align 4
|
||||
bn_mul_add_words:
|
||||
lghi zero,0 // zero = 0
|
||||
la %r1,0(%r2) // put rp aside [to give way to]
|
||||
lghi %r2,0 // return value
|
||||
ltgfr %r4,%r4
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stmg %r6,%r13,48(%r15)
|
||||
lghi %r2,3
|
||||
lghi %r12,0 // carry = 0
|
||||
slgr %r1,%r3 // rp-=ap
|
||||
nr %r2,%r4 // len%4
|
||||
sra %r4,2 // cnt=len/4
|
||||
jz .Loop1_madd // carry is incidentally cleared if branch taken
|
||||
algr zero,zero // clear carry
|
||||
|
||||
lg %r7,0(%r3) // ap[0]
|
||||
lg %r9,8(%r3) // ap[1]
|
||||
mlgr %r6,%r5 // *=w
|
||||
brct %r4,.Loop4_madd
|
||||
j .Loop4_madd_tail
|
||||
|
||||
.Loop4_madd:
|
||||
mlgr %r8,%r5
|
||||
lg %r11,16(%r3) // ap[i+2]
|
||||
alcgr %r7,%r12 // +=carry
|
||||
alcgr %r6,zero
|
||||
alg %r7,0(%r3,%r1) // +=rp[i]
|
||||
stg %r7,0(%r3,%r1) // rp[i]=
|
||||
|
||||
mlgr %r10,%r5
|
||||
lg %r13,24(%r3)
|
||||
alcgr %r9,%r6
|
||||
alcgr %r8,zero
|
||||
alg %r9,8(%r3,%r1)
|
||||
stg %r9,8(%r3,%r1)
|
||||
|
||||
mlgr %r12,%r5
|
||||
lg %r7,32(%r3)
|
||||
alcgr %r11,%r8
|
||||
alcgr %r10,zero
|
||||
alg %r11,16(%r3,%r1)
|
||||
stg %r11,16(%r3,%r1)
|
||||
|
||||
mlgr %r6,%r5
|
||||
lg %r9,40(%r3)
|
||||
alcgr %r13,%r10
|
||||
alcgr %r12,zero
|
||||
alg %r13,24(%r3,%r1)
|
||||
stg %r13,24(%r3,%r1)
|
||||
|
||||
la %r3,32(%r3) // i+=4
|
||||
brct %r4,.Loop4_madd
|
||||
|
||||
.Loop4_madd_tail:
|
||||
mlgr %r8,%r5
|
||||
lg %r11,16(%r3)
|
||||
alcgr %r7,%r12 // +=carry
|
||||
alcgr %r6,zero
|
||||
alg %r7,0(%r3,%r1) // +=rp[i]
|
||||
stg %r7,0(%r3,%r1) // rp[i]=
|
||||
|
||||
mlgr %r10,%r5
|
||||
lg %r13,24(%r3)
|
||||
alcgr %r9,%r6
|
||||
alcgr %r8,zero
|
||||
alg %r9,8(%r3,%r1)
|
||||
stg %r9,8(%r3,%r1)
|
||||
|
||||
mlgr %r12,%r5
|
||||
alcgr %r11,%r8
|
||||
alcgr %r10,zero
|
||||
alg %r11,16(%r3,%r1)
|
||||
stg %r11,16(%r3,%r1)
|
||||
|
||||
alcgr %r13,%r10
|
||||
alcgr %r12,zero
|
||||
alg %r13,24(%r3,%r1)
|
||||
stg %r13,24(%r3,%r1)
|
||||
|
||||
la %r3,32(%r3) // i+=4
|
||||
|
||||
la %r2,1(%r2) // see if len%4 is zero ...
|
||||
brct %r2,.Loop1_madd // without touching condition code:-)
|
||||
|
||||
.Lend_madd:
|
||||
lgr %r2,zero // return value
|
||||
alcgr %r2,%r12 // collect even carry bit
|
||||
lmg %r6,%r13,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_madd:
|
||||
lg %r7,0(%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r12 // +=carry
|
||||
alcgr %r6,zero
|
||||
alg %r7,0(%r3,%r1) // +=rp[i]
|
||||
stg %r7,0(%r3,%r1) // rp[i]=
|
||||
|
||||
lgr %r12,%r6
|
||||
la %r3,8(%r3) // i++
|
||||
brct %r2,.Loop1_madd
|
||||
|
||||
j .Lend_madd
|
||||
.size bn_mul_add_words,.-bn_mul_add_words
|
||||
|
||||
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
|
||||
.globl bn_mul_words
|
||||
.type bn_mul_words,@function
|
||||
.align 4
|
||||
bn_mul_words:
|
||||
lghi zero,0 // zero = 0
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0;
|
||||
ltgfr %r4,%r4
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stmg %r6,%r10,48(%r15)
|
||||
lghi %r10,3
|
||||
lghi %r8,0 // carry = 0
|
||||
nr %r10,%r4 // len%4
|
||||
sra %r4,2 // cnt=len/4
|
||||
jz .Loop1_mul // carry is incidentally cleared if branch taken
|
||||
algr zero,zero // clear carry
|
||||
|
||||
.Loop4_mul:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lg %r9,8(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
stg %r9,8(%r2,%r1)
|
||||
|
||||
lg %r7,16(%r2,%r3)
|
||||
mlgr %r6,%r5
|
||||
alcgr %r7,%r8
|
||||
stg %r7,16(%r2,%r1)
|
||||
|
||||
lg %r9,24(%r2,%r3)
|
||||
mlgr %r8,%r5
|
||||
alcgr %r9,%r6
|
||||
stg %r9,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r4,.Loop4_mul
|
||||
|
||||
la %r10,1(%r10) // see if len%4 is zero ...
|
||||
brct %r10,.Loop1_mul // without touching condition code:-)
|
||||
|
||||
.Lend_mul:
|
||||
alcgr %r8,zero // collect carry bit
|
||||
lgr %r2,%r8
|
||||
lmg %r6,%r10,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_mul:
|
||||
lg %r7,0(%r2,%r3) // ap[i]
|
||||
mlgr %r6,%r5 // *=w
|
||||
alcgr %r7,%r8 // +=carry
|
||||
stg %r7,0(%r2,%r1) // rp[i]=
|
||||
|
||||
lgr %r8,%r6
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r10,.Loop1_mul
|
||||
|
||||
j .Lend_mul
|
||||
.size bn_mul_words,.-bn_mul_words
|
||||
|
||||
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
|
||||
.globl bn_sqr_words
|
||||
.type bn_sqr_words,@function
|
||||
.align 4
|
||||
bn_sqr_words:
|
||||
ltgfr %r4,%r4
|
||||
bler %r14
|
||||
|
||||
stmg %r6,%r7,48(%r15)
|
||||
srag %r1,%r4,2 // cnt=len/4
|
||||
jz .Loop1_sqr
|
||||
|
||||
.Loop4_sqr:
|
||||
lg %r7,0(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,0(%r2)
|
||||
stg %r6,8(%r2)
|
||||
|
||||
lg %r7,8(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,16(%r2)
|
||||
stg %r6,24(%r2)
|
||||
|
||||
lg %r7,16(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,32(%r2)
|
||||
stg %r6,40(%r2)
|
||||
|
||||
lg %r7,24(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,48(%r2)
|
||||
stg %r6,56(%r2)
|
||||
|
||||
la %r3,32(%r3)
|
||||
la %r2,64(%r2)
|
||||
brct %r1,.Loop4_sqr
|
||||
|
||||
lghi %r1,3
|
||||
nr %r4,%r1 // cnt=len%4
|
||||
jz .Lend_sqr
|
||||
|
||||
.Loop1_sqr:
|
||||
lg %r7,0(%r3)
|
||||
mlgr %r6,%r7
|
||||
stg %r7,0(%r2)
|
||||
stg %r6,8(%r2)
|
||||
|
||||
la %r3,8(%r3)
|
||||
la %r2,16(%r2)
|
||||
brct %r4,.Loop1_sqr
|
||||
|
||||
.Lend_sqr:
|
||||
lmg %r6,%r7,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_words,.-bn_sqr_words
|
||||
|
||||
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
|
||||
.globl bn_div_words
|
||||
.type bn_div_words,@function
|
||||
.align 4
|
||||
bn_div_words:
|
||||
dlgr %r2,%r4
|
||||
lgr %r2,%r3
|
||||
br %r14
|
||||
.size bn_div_words,.-bn_div_words
|
||||
|
||||
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
|
||||
.globl bn_add_words
|
||||
.type bn_add_words,@function
|
||||
.align 4
|
||||
bn_add_words:
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0
|
||||
ltgfr %r5,%r5
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stg %r6,48(%r15)
|
||||
lghi %r6,3
|
||||
nr %r6,%r5 // len%4
|
||||
sra %r5,2 // len/4, use sra because it sets condition code
|
||||
jz .Loop1_add // carry is incidentally cleared if branch taken
|
||||
algr %r2,%r2 // clear carry
|
||||
|
||||
.Loop4_add:
|
||||
lg %r0,0(%r2,%r3)
|
||||
alcg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
lg %r0,8(%r2,%r3)
|
||||
alcg %r0,8(%r2,%r4)
|
||||
stg %r0,8(%r2,%r1)
|
||||
lg %r0,16(%r2,%r3)
|
||||
alcg %r0,16(%r2,%r4)
|
||||
stg %r0,16(%r2,%r1)
|
||||
lg %r0,24(%r2,%r3)
|
||||
alcg %r0,24(%r2,%r4)
|
||||
stg %r0,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r5,.Loop4_add
|
||||
|
||||
la %r6,1(%r6) // see if len%4 is zero ...
|
||||
brct %r6,.Loop1_add // without touching condition code:-)
|
||||
|
||||
.Lexit_add:
|
||||
lghi %r2,0
|
||||
alcgr %r2,%r2
|
||||
lg %r6,48(%r15)
|
||||
br %r14
|
||||
|
||||
.Loop1_add:
|
||||
lg %r0,0(%r2,%r3)
|
||||
alcg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r6,.Loop1_add
|
||||
|
||||
j .Lexit_add
|
||||
.size bn_add_words,.-bn_add_words
|
||||
|
||||
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
|
||||
.globl bn_sub_words
|
||||
.type bn_sub_words,@function
|
||||
.align 4
|
||||
bn_sub_words:
|
||||
la %r1,0(%r2) // put rp aside
|
||||
lghi %r2,0 // i=0
|
||||
ltgfr %r5,%r5
|
||||
bler %r14 // if (len<=0) return 0;
|
||||
|
||||
stg %r6,48(%r15)
|
||||
lghi %r6,3
|
||||
nr %r6,%r5 // len%4
|
||||
sra %r5,2 // len/4, use sra because it sets condition code
|
||||
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
|
||||
slgr %r2,%r2 // clear borrow
|
||||
|
||||
.Loop1_sub:
|
||||
lg %r0,0(%r2,%r3)
|
||||
slbg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
|
||||
la %r2,8(%r2) // i++
|
||||
brct %r6,.Loop1_sub
|
||||
j .Lexit_sub
|
||||
|
||||
.Loop4_sub:
|
||||
lg %r0,0(%r2,%r3)
|
||||
slbg %r0,0(%r2,%r4)
|
||||
stg %r0,0(%r2,%r1)
|
||||
lg %r0,8(%r2,%r3)
|
||||
slbg %r0,8(%r2,%r4)
|
||||
stg %r0,8(%r2,%r1)
|
||||
lg %r0,16(%r2,%r3)
|
||||
slbg %r0,16(%r2,%r4)
|
||||
stg %r0,16(%r2,%r1)
|
||||
lg %r0,24(%r2,%r3)
|
||||
slbg %r0,24(%r2,%r4)
|
||||
stg %r0,24(%r2,%r1)
|
||||
|
||||
la %r2,32(%r2) // i+=4
|
||||
brct %r5,.Loop4_sub
|
||||
|
||||
la %r6,1(%r6) // see if len%4 is zero ...
|
||||
brct %r6,.Loop1_sub // without touching condition code:-)
|
||||
|
||||
.Lexit_sub:
|
||||
lghi %r2,0
|
||||
slbgr %r2,%r2
|
||||
lcgr %r2,%r2
|
||||
lg %r6,48(%r15)
|
||||
br %r14
|
||||
.size bn_sub_words,.-bn_sub_words
|
||||
|
||||
#define c1 %r1
|
||||
#define c2 %r5
|
||||
#define c3 %r8
|
||||
|
||||
#define mul_add_c(ai,bi,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlg %r6,bi*8(%r4); \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
|
||||
.globl bn_mul_comba8
|
||||
.type bn_mul_comba8,@function
|
||||
.align 4
|
||||
bn_mul_comba8:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
mul_add_c(0,0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(0,1,c2,c3,c1);
|
||||
mul_add_c(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(2,0,c3,c1,c2);
|
||||
mul_add_c(1,1,c3,c1,c2);
|
||||
mul_add_c(0,2,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(0,3,c1,c2,c3);
|
||||
mul_add_c(1,2,c1,c2,c3);
|
||||
mul_add_c(2,1,c1,c2,c3);
|
||||
mul_add_c(3,0,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(4,0,c2,c3,c1);
|
||||
mul_add_c(3,1,c2,c3,c1);
|
||||
mul_add_c(2,2,c2,c3,c1);
|
||||
mul_add_c(1,3,c2,c3,c1);
|
||||
mul_add_c(0,4,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(0,5,c3,c1,c2);
|
||||
mul_add_c(1,4,c3,c1,c2);
|
||||
mul_add_c(2,3,c3,c1,c2);
|
||||
mul_add_c(3,2,c3,c1,c2);
|
||||
mul_add_c(4,1,c3,c1,c2);
|
||||
mul_add_c(5,0,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(6,0,c1,c2,c3);
|
||||
mul_add_c(5,1,c1,c2,c3);
|
||||
mul_add_c(4,2,c1,c2,c3);
|
||||
mul_add_c(3,3,c1,c2,c3);
|
||||
mul_add_c(2,4,c1,c2,c3);
|
||||
mul_add_c(1,5,c1,c2,c3);
|
||||
mul_add_c(0,6,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(0,7,c2,c3,c1);
|
||||
mul_add_c(1,6,c2,c3,c1);
|
||||
mul_add_c(2,5,c2,c3,c1);
|
||||
mul_add_c(3,4,c2,c3,c1);
|
||||
mul_add_c(4,3,c2,c3,c1);
|
||||
mul_add_c(5,2,c2,c3,c1);
|
||||
mul_add_c(6,1,c2,c3,c1);
|
||||
mul_add_c(7,0,c2,c3,c1);
|
||||
stg c2,7*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(7,1,c3,c1,c2);
|
||||
mul_add_c(6,2,c3,c1,c2);
|
||||
mul_add_c(5,3,c3,c1,c2);
|
||||
mul_add_c(4,4,c3,c1,c2);
|
||||
mul_add_c(3,5,c3,c1,c2);
|
||||
mul_add_c(2,6,c3,c1,c2);
|
||||
mul_add_c(1,7,c3,c1,c2);
|
||||
stg c3,8*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(2,7,c1,c2,c3);
|
||||
mul_add_c(3,6,c1,c2,c3);
|
||||
mul_add_c(4,5,c1,c2,c3);
|
||||
mul_add_c(5,4,c1,c2,c3);
|
||||
mul_add_c(6,3,c1,c2,c3);
|
||||
mul_add_c(7,2,c1,c2,c3);
|
||||
stg c1,9*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(7,3,c2,c3,c1);
|
||||
mul_add_c(6,4,c2,c3,c1);
|
||||
mul_add_c(5,5,c2,c3,c1);
|
||||
mul_add_c(4,6,c2,c3,c1);
|
||||
mul_add_c(3,7,c2,c3,c1);
|
||||
stg c2,10*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(4,7,c3,c1,c2);
|
||||
mul_add_c(5,6,c3,c1,c2);
|
||||
mul_add_c(6,5,c3,c1,c2);
|
||||
mul_add_c(7,4,c3,c1,c2);
|
||||
stg c3,11*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(7,5,c1,c2,c3);
|
||||
mul_add_c(6,6,c1,c2,c3);
|
||||
mul_add_c(5,7,c1,c2,c3);
|
||||
stg c1,12*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
|
||||
mul_add_c(6,7,c2,c3,c1);
|
||||
mul_add_c(7,6,c2,c3,c1);
|
||||
stg c2,13*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(7,7,c3,c1,c2);
|
||||
stg c3,14*8(%r2)
|
||||
stg c1,15*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_mul_comba8,.-bn_mul_comba8
|
||||
|
||||
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
|
||||
.globl bn_mul_comba4
|
||||
.type bn_mul_comba4,@function
|
||||
.align 4
|
||||
bn_mul_comba4:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
mul_add_c(0,0,c1,c2,c3);
|
||||
stg c1,0*8(%r3)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(0,1,c2,c3,c1);
|
||||
mul_add_c(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(2,0,c3,c1,c2);
|
||||
mul_add_c(1,1,c3,c1,c2);
|
||||
mul_add_c(0,2,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(0,3,c1,c2,c3);
|
||||
mul_add_c(1,2,c1,c2,c3);
|
||||
mul_add_c(2,1,c1,c2,c3);
|
||||
mul_add_c(3,0,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
mul_add_c(3,1,c2,c3,c1);
|
||||
mul_add_c(2,2,c2,c3,c1);
|
||||
mul_add_c(1,3,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
mul_add_c(2,3,c3,c1,c2);
|
||||
mul_add_c(3,2,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
mul_add_c(3,3,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
stg c2,7*8(%r2)
|
||||
|
||||
stmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_mul_comba4,.-bn_mul_comba4
|
||||
|
||||
#define sqr_add_c(ai,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlgr %r6,%r7; \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
#define sqr_add_c2(ai,aj,c1,c2,c3) \
|
||||
lg %r7,ai*8(%r3); \
|
||||
mlg %r6,aj*8(%r3); \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero; \
|
||||
algr c1,%r7; \
|
||||
alcgr c2,%r6; \
|
||||
alcgr c3,zero
|
||||
|
||||
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
|
||||
.globl bn_sqr_comba8
|
||||
.type bn_sqr_comba8,@function
|
||||
.align 4
|
||||
bn_sqr_comba8:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
sqr_add_c(0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(1,c3,c1,c2);
|
||||
sqr_add_c2(2,0,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(3,0,c1,c2,c3);
|
||||
sqr_add_c2(2,1,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(2,c2,c3,c1);
|
||||
sqr_add_c2(3,1,c2,c3,c1);
|
||||
sqr_add_c2(4,0,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(5,0,c3,c1,c2);
|
||||
sqr_add_c2(4,1,c3,c1,c2);
|
||||
sqr_add_c2(3,2,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(3,c1,c2,c3);
|
||||
sqr_add_c2(4,2,c1,c2,c3);
|
||||
sqr_add_c2(5,1,c1,c2,c3);
|
||||
sqr_add_c2(6,0,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(7,0,c2,c3,c1);
|
||||
sqr_add_c2(6,1,c2,c3,c1);
|
||||
sqr_add_c2(5,2,c2,c3,c1);
|
||||
sqr_add_c2(4,3,c2,c3,c1);
|
||||
stg c2,7*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(4,c3,c1,c2);
|
||||
sqr_add_c2(5,3,c3,c1,c2);
|
||||
sqr_add_c2(6,2,c3,c1,c2);
|
||||
sqr_add_c2(7,1,c3,c1,c2);
|
||||
stg c3,8*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(7,2,c1,c2,c3);
|
||||
sqr_add_c2(6,3,c1,c2,c3);
|
||||
sqr_add_c2(5,4,c1,c2,c3);
|
||||
stg c1,9*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(5,c2,c3,c1);
|
||||
sqr_add_c2(6,4,c2,c3,c1);
|
||||
sqr_add_c2(7,3,c2,c3,c1);
|
||||
stg c2,10*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(7,4,c3,c1,c2);
|
||||
sqr_add_c2(6,5,c3,c1,c2);
|
||||
stg c3,11*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(6,c1,c2,c3);
|
||||
sqr_add_c2(7,5,c1,c2,c3);
|
||||
stg c1,12*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(7,6,c2,c3,c1);
|
||||
stg c2,13*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(7,c3,c1,c2);
|
||||
stg c3,14*8(%r2)
|
||||
stg c1,15*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_comba8,.-bn_sqr_comba8
|
||||
|
||||
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
|
||||
.globl bn_sqr_comba4
|
||||
.type bn_sqr_comba4,@function
|
||||
.align 4
|
||||
bn_sqr_comba4:
|
||||
stmg %r6,%r8,48(%r15)
|
||||
|
||||
lghi c1,0
|
||||
lghi c2,0
|
||||
lghi c3,0
|
||||
lghi zero,0
|
||||
|
||||
sqr_add_c(0,c1,c2,c3);
|
||||
stg c1,0*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c2(1,0,c2,c3,c1);
|
||||
stg c2,1*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c(1,c3,c1,c2);
|
||||
sqr_add_c2(2,0,c3,c1,c2);
|
||||
stg c3,2*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c2(3,0,c1,c2,c3);
|
||||
sqr_add_c2(2,1,c1,c2,c3);
|
||||
stg c1,3*8(%r2)
|
||||
lghi c1,0
|
||||
|
||||
sqr_add_c(2,c2,c3,c1);
|
||||
sqr_add_c2(3,1,c2,c3,c1);
|
||||
stg c2,4*8(%r2)
|
||||
lghi c2,0
|
||||
|
||||
sqr_add_c2(3,2,c3,c1,c2);
|
||||
stg c3,5*8(%r2)
|
||||
lghi c3,0
|
||||
|
||||
sqr_add_c(3,c1,c2,c3);
|
||||
stg c1,6*8(%r2)
|
||||
stg c2,7*8(%r2)
|
||||
|
||||
lmg %r6,%r8,48(%r15)
|
||||
br %r14
|
||||
.size bn_sqr_comba4,.-bn_sqr_comba4
|
||||
1222
openssl-1.0.2f/crypto/bn/asm/sparct4-mont.pl
Executable file
1222
openssl-1.0.2f/crypto/bn/asm/sparct4-mont.pl
Executable file
File diff suppressed because it is too large
Load Diff
1458
openssl-1.0.2f/crypto/bn/asm/sparcv8.S
Normal file
1458
openssl-1.0.2f/crypto/bn/asm/sparcv8.S
Normal file
File diff suppressed because it is too large
Load Diff
1558
openssl-1.0.2f/crypto/bn/asm/sparcv8plus.S
Normal file
1558
openssl-1.0.2f/crypto/bn/asm/sparcv8plus.S
Normal file
File diff suppressed because it is too large
Load Diff
190
openssl-1.0.2f/crypto/bn/asm/sparcv9-gf2m.pl
Normal file
190
openssl-1.0.2f/crypto/bn/asm/sparcv9-gf2m.pl
Normal file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# October 2012
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... Except that it has two code paths: one suitable
|
||||
# for all SPARCv9 processors and one for VIS3-capable ones. Former
|
||||
# delivers ~25-45% more, more for longer keys, heaviest DH and DSA
|
||||
# verify operations on venerable UltraSPARC II. On T4 VIS3 code is
|
||||
# ~100-230% faster than gcc-generated code and ~35-90% faster than
|
||||
# the pure SPARCv9 code path.
|
||||
|
||||
$locals=16*8;
|
||||
|
||||
$tab="%l0";
|
||||
|
||||
@T=("%g2","%g3");
|
||||
@i=("%g4","%g5");
|
||||
|
||||
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
|
||||
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
|
||||
|
||||
$code.=<<___;
|
||||
#include <sparc_arch.h>
|
||||
|
||||
#ifdef __arch64__
|
||||
.register %g2,#scratch
|
||||
.register %g3,#scratch
|
||||
#endif
|
||||
|
||||
#ifdef __PIC__
|
||||
SPARC_PIC_THUNK(%g1)
|
||||
#endif
|
||||
|
||||
.globl bn_GF2m_mul_2x2
|
||||
.align 16
|
||||
bn_GF2m_mul_2x2:
|
||||
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
|
||||
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
|
||||
|
||||
andcc %g1, SPARCV9_VIS3, %g0
|
||||
bz,pn %icc,.Lsoftware
|
||||
nop
|
||||
|
||||
sllx %o1, 32, %o1
|
||||
sllx %o3, 32, %o3
|
||||
or %o2, %o1, %o1
|
||||
or %o4, %o3, %o3
|
||||
.word 0x95b262ab ! xmulx %o1, %o3, %o2
|
||||
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
|
||||
srlx %o2, 32, %o1 ! 13 cycles later
|
||||
st %o2, [%o0+0]
|
||||
st %o1, [%o0+4]
|
||||
srlx %o4, 32, %o3
|
||||
st %o4, [%o0+8]
|
||||
retl
|
||||
st %o3, [%o0+12]
|
||||
|
||||
.align 16
|
||||
.Lsoftware:
|
||||
save %sp,-STACK_FRAME-$locals,%sp
|
||||
|
||||
sllx %i1,32,$a
|
||||
mov -1,$a12
|
||||
sllx %i3,32,$b
|
||||
or %i2,$a,$a
|
||||
srlx $a12,1,$a48 ! 0x7fff...
|
||||
or %i4,$b,$b
|
||||
srlx $a12,2,$a12 ! 0x3fff...
|
||||
add %sp,STACK_BIAS+STACK_FRAME,$tab
|
||||
|
||||
sllx $a,2,$a4
|
||||
mov $a,$a1
|
||||
sllx $a,1,$a2
|
||||
|
||||
srax $a4,63,@i[1] ! broadcast 61st bit
|
||||
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
|
||||
srlx $a48,2,$a48
|
||||
srax $a2,63,@i[0] ! broadcast 62nd bit
|
||||
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
|
||||
srax $a1,63,$lo ! broadcast 63rd bit
|
||||
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
|
||||
|
||||
sllx $a1,3,$a8
|
||||
and $b,$lo,$lo
|
||||
and $b,@i[0],@i[0]
|
||||
and $b,@i[1],@i[1]
|
||||
|
||||
stx %g0,[$tab+0*8] ! tab[0]=0
|
||||
xor $a1,$a2,$a12
|
||||
stx $a1,[$tab+1*8] ! tab[1]=a1
|
||||
stx $a2,[$tab+2*8] ! tab[2]=a2
|
||||
xor $a4,$a8,$a48
|
||||
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
|
||||
xor $a4,$a1,$a1
|
||||
|
||||
stx $a4,[$tab+4*8] ! tab[4]=a4
|
||||
xor $a4,$a2,$a2
|
||||
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
|
||||
xor $a4,$a12,$a12
|
||||
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
|
||||
xor $a48,$a1,$a1
|
||||
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
|
||||
xor $a48,$a2,$a2
|
||||
|
||||
stx $a8,[$tab+8*8] ! tab[8]=a8
|
||||
xor $a48,$a12,$a12
|
||||
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
|
||||
xor $a4,$a1,$a1
|
||||
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
|
||||
xor $a4,$a2,$a2
|
||||
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
|
||||
|
||||
xor $a4,$a12,$a12
|
||||
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
|
||||
srlx $lo,1,$hi
|
||||
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
|
||||
sllx $lo,63,$lo
|
||||
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
|
||||
srlx @i[0],2,@T[0]
|
||||
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
|
||||
|
||||
sllx @i[0],62,$a1
|
||||
sllx $b,3,@i[0]
|
||||
srlx @i[1],3,@T[1]
|
||||
and @i[0],`0xf<<3`,@i[0]
|
||||
sllx @i[1],61,$a2
|
||||
ldx [$tab+@i[0]],@i[0]
|
||||
srlx $b,4-3,@i[1]
|
||||
xor @T[0],$hi,$hi
|
||||
and @i[1],`0xf<<3`,@i[1]
|
||||
xor $a1,$lo,$lo
|
||||
ldx [$tab+@i[1]],@i[1]
|
||||
xor @T[1],$hi,$hi
|
||||
|
||||
xor @i[0],$lo,$lo
|
||||
srlx $b,8-3,@i[0]
|
||||
xor $a2,$lo,$lo
|
||||
and @i[0],`0xf<<3`,@i[0]
|
||||
___
|
||||
for($n=1;$n<14;$n++) {
|
||||
$code.=<<___;
|
||||
sllx @i[1],`$n*4`,@T[0]
|
||||
ldx [$tab+@i[0]],@i[0]
|
||||
srlx @i[1],`64-$n*4`,@T[1]
|
||||
xor @T[0],$lo,$lo
|
||||
srlx $b,`($n+2)*4`-3,@i[1]
|
||||
xor @T[1],$hi,$hi
|
||||
and @i[1],`0xf<<3`,@i[1]
|
||||
___
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
$code.=<<___;
|
||||
sllx @i[1],`$n*4`,@T[0]
|
||||
ldx [$tab+@i[0]],@i[0]
|
||||
srlx @i[1],`64-$n*4`,@T[1]
|
||||
xor @T[0],$lo,$lo
|
||||
|
||||
sllx @i[0],`($n+1)*4`,@T[0]
|
||||
xor @T[1],$hi,$hi
|
||||
srlx @i[0],`64-($n+1)*4`,@T[1]
|
||||
xor @T[0],$lo,$lo
|
||||
xor @T[1],$hi,$hi
|
||||
|
||||
srlx $lo,32,%i1
|
||||
st $lo,[%i0+0]
|
||||
st %i1,[%i0+4]
|
||||
srlx $hi,32,%i2
|
||||
st $hi,[%i0+8]
|
||||
st %i2,[%i0+12]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type bn_GF2m_mul_2x2,#function
|
||||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
|
||||
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
606
openssl-1.0.2f/crypto/bn/asm/sparcv9-mont.pl
Normal file
606
openssl-1.0.2f/crypto/bn/asm/sparcv9-mont.pl
Normal file
@@ -0,0 +1,606 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# December 2005
|
||||
#
|
||||
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
|
||||
# for undertaken effort are multiple. First of all, UltraSPARC is not
|
||||
# the whole SPARCv9 universe and other VIS-free implementations deserve
|
||||
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
|
||||
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
|
||||
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
|
||||
# several integrated RSA/DSA accelerator circuits accessible through
|
||||
# kernel driver [only(*)], but having decent user-land software
|
||||
# implementation is important too. Finally, reasons like desire to
|
||||
# experiment with dedicated squaring procedure. Yes, this module
|
||||
# implements one, because it was easiest to draft it in SPARCv9
|
||||
# instructions...
|
||||
|
||||
# (*) Engine accessing the driver in question is on my TODO list.
|
||||
# For reference, acceleator is estimated to give 6 to 10 times
|
||||
# improvement on single-threaded RSA sign. It should be noted
|
||||
# that 6-10x improvement coefficient does not actually mean
|
||||
# something extraordinary in terms of absolute [single-threaded]
|
||||
# performance, as SPARCv9 instruction set is by all means least
|
||||
# suitable for high performance crypto among other 64 bit
|
||||
# platforms. 6-10x factor simply places T1 in same performance
|
||||
# domain as say AMD64 and IA-64. Improvement of RSA verify don't
|
||||
# appear impressive at all, but it's the sign operation which is
|
||||
# far more critical/interesting.
|
||||
|
||||
# You might notice that inner loops are modulo-scheduled:-) This has
|
||||
# essentially negligible impact on UltraSPARC performance, it's
|
||||
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
|
||||
# the advantage... Currently this module surpasses sparcv9a-mont.pl
|
||||
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
|
||||
# module still have hidden potential [see TODO list there], which is
|
||||
# estimated to be larger than 20%...
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="%i0"; # BN_ULONG *rp,
|
||||
$ap="%i1"; # const BN_ULONG *ap,
|
||||
$bp="%i2"; # const BN_ULONG *bp,
|
||||
$np="%i3"; # const BN_ULONG *np,
|
||||
$n0="%i4"; # const BN_ULONG *n0,
|
||||
$num="%i5"; # int num);
|
||||
|
||||
$bits=32;
|
||||
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
|
||||
if ($bits==64) { $bias=2047; $frame=192; }
|
||||
else { $bias=0; $frame=128; }
|
||||
|
||||
$car0="%o0";
|
||||
$car1="%o1";
|
||||
$car2="%o2"; # 1 bit
|
||||
$acc0="%o3";
|
||||
$acc1="%o4";
|
||||
$mask="%g1"; # 32 bits, what a waste...
|
||||
$tmp0="%g4";
|
||||
$tmp1="%g5";
|
||||
|
||||
$i="%l0";
|
||||
$j="%l1";
|
||||
$mul0="%l2";
|
||||
$mul1="%l3";
|
||||
$tp="%l4";
|
||||
$apj="%l5";
|
||||
$npj="%l6";
|
||||
$tpj="%l7";
|
||||
|
||||
$fname="bn_mul_mont_int";
|
||||
|
||||
$code=<<___;
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.global $fname
|
||||
.align 32
|
||||
$fname:
|
||||
cmp %o5,4 ! 128 bits minimum
|
||||
bge,pt %icc,.Lenter
|
||||
sethi %hi(0xffffffff),$mask
|
||||
retl
|
||||
clr %o0
|
||||
.align 32
|
||||
.Lenter:
|
||||
save %sp,-$frame,%sp
|
||||
sll $num,2,$num ! num*=4
|
||||
or $mask,%lo(0xffffffff),$mask
|
||||
ld [$n0],$n0
|
||||
cmp $ap,$bp
|
||||
and $num,$mask,$num
|
||||
ld [$bp],$mul0 ! bp[0]
|
||||
nop
|
||||
|
||||
add %sp,$bias,%o7 ! real top of stack
|
||||
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
|
||||
sub %o7,$num,%o7
|
||||
ld [$ap+4],$apj ! ap[1]
|
||||
and %o7,-1024,%o7
|
||||
ld [$np],$car1 ! np[0]
|
||||
sub %o7,$bias,%sp ! alloca
|
||||
ld [$np+4],$npj ! np[1]
|
||||
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
|
||||
mov 12,$j
|
||||
|
||||
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
|
||||
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
|
||||
and $car0,$mask,$acc0
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap+8],$apj !prologue!
|
||||
|
||||
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
||||
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.L1st:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
add $j,4,$j ! j++
|
||||
mov $tmp0,$acc0
|
||||
st $car1,[$tp]
|
||||
cmp $j,$num
|
||||
mov $tmp1,$acc1
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.L1st
|
||||
add $tp,4,$tp ! tp++
|
||||
!.L1st
|
||||
|
||||
mulx $apj,$mul0,$tmp0 !epilogue!
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tmp0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
mov 4,$i ! i++
|
||||
ld [$bp+4],$mul0 ! bp[1]
|
||||
.Louter:
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap],$car0 ! ap[0]
|
||||
ld [$ap+4],$apj ! ap[1]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
ld [$tp],$tmp1 ! tp[0]
|
||||
ld [$tp+4],$tpj ! tp[1]
|
||||
mov 12,$j
|
||||
|
||||
mulx $car0,$mul0,$car0
|
||||
mulx $apj,$mul0,$tmp0 !prologue!
|
||||
add $tmp1,$car0,$car0
|
||||
ld [$ap+8],$apj !prologue!
|
||||
and $car0,$mask,$acc0
|
||||
|
||||
mulx $n0,$acc0,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1
|
||||
mulx $npj,$mul1,$acc1 !prologue!
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.Linner:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $tpj,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
add $acc0,$car0,$car0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
add $j,4,$j ! j++
|
||||
mov $tmp0,$acc0
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp1,$acc1
|
||||
cmp $j,$num
|
||||
bl %icc,.Linner
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Linner
|
||||
|
||||
mulx $apj,$mul0,$tmp0 !epilogue!
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $tpj,$car0,$car0
|
||||
add $acc0,$car0,$car0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
and $car0,$mask,$acc0
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tpj,$car0,$car0
|
||||
add $tmp0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4] ! tp[j-1]
|
||||
srlx $car0,32,$car0
|
||||
add $i,4,$i ! i++
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car1,$car1
|
||||
cmp $i,$num
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
|
||||
srlx $car1,32,$car2
|
||||
bl,a %icc,.Louter
|
||||
ld [$bp+$i],$mul0 ! bp[i]
|
||||
!.Louter
|
||||
|
||||
add $tp,12,$tp
|
||||
|
||||
.Ltail:
|
||||
add $np,$num,$np
|
||||
add $rp,$num,$rp
|
||||
mov $tp,$ap
|
||||
sub %g0,$num,%o7 ! k=-num
|
||||
ba .Lsub
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
.align 16
|
||||
.Lsub:
|
||||
ld [$tp+%o7],%o0
|
||||
ld [$np+%o7],%o1
|
||||
subccc %o0,%o1,%o1 ! tp[j]-np[j]
|
||||
add $rp,%o7,$i
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lsub
|
||||
st %o1,[$i]
|
||||
subc $car2,0,$car2 ! handle upmost overflow bit
|
||||
and $tp,$car2,$ap
|
||||
andn $rp,$car2,$np
|
||||
or $ap,$np,$ap
|
||||
sub %g0,$num,%o7
|
||||
|
||||
.Lcopy:
|
||||
ld [$ap+%o7],%o0 ! copy or in-place refresh
|
||||
st %g0,[$tp+%o7] ! zap tp
|
||||
st %o0,[$rp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lcopy
|
||||
nop
|
||||
mov 1,%i0
|
||||
ret
|
||||
restore
|
||||
___
|
||||
|
||||
########
|
||||
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
|
||||
######## code without following dedicated squaring procedure.
|
||||
########
|
||||
$sbit="%i2"; # re-use $bp!
|
||||
|
||||
$code.=<<___;
|
||||
.align 32
|
||||
.Lbn_sqr_mont:
|
||||
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
|
||||
mulx $apj,$mul0,$tmp0 !prologue!
|
||||
and $car0,$mask,$acc0
|
||||
add %sp,$bias+$frame,$tp
|
||||
ld [$ap+8],$apj !prologue!
|
||||
|
||||
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
|
||||
srlx $car0,32,$car0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
|
||||
mulx $npj,$mul1,$acc1 !prologue!
|
||||
and $car0,1,$sbit
|
||||
ld [$np+8],$npj !prologue!
|
||||
srlx $car0,1,$car0
|
||||
add $acc0,$car1,$car1
|
||||
srlx $car1,32,$car1
|
||||
mov $tmp0,$acc0 !prologue!
|
||||
|
||||
.Lsqr_1st:
|
||||
mulx $apj,$mul0,$tmp0
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
mov $tmp1,$acc1
|
||||
srlx $acc0,32,$sbit
|
||||
add $j,4,$j ! j++
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
mov $tmp0,$acc0
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_1st
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Lsqr_1st
|
||||
|
||||
mulx $apj,$mul0,$tmp0 ! epilogue
|
||||
mulx $npj,$mul1,$tmp1
|
||||
add $acc0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $acc1,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
|
||||
add $tmp1,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
st $car1,[$tp+8]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
|
||||
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
|
||||
ld [$ap+4],$mul0 ! ap[1]
|
||||
ld [$ap+8],$apj ! ap[2]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp0,$mul1
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
and $mul1,$mask,$mul1
|
||||
|
||||
mulx $car1,$mul1,$car1
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tmp0,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+8],$npj ! np[2]
|
||||
srlx $car1,32,$car1
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$car1,$car1
|
||||
and $car0,1,$sbit
|
||||
add $acc1,$car1,$car1
|
||||
srlx $car0,1,$car0
|
||||
mov 12,$j
|
||||
st $car1,[%sp+$bias+$frame] ! tp[0]=
|
||||
srlx $car1,32,$car1
|
||||
add %sp,$bias+$frame+4,$tp
|
||||
|
||||
.Lsqr_2nd:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $acc0,$car0,$car0
|
||||
add $tpj,$car1,$car1
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc1,$car1,$car1
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
add $acc0,$acc0,$acc0
|
||||
add $j,4,$j ! j++
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_2nd
|
||||
add $tp,4,$tp ! tp++
|
||||
!.Lsqr_2nd
|
||||
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $acc0,$car0,$car0
|
||||
add $tpj,$car1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc1,$car1,$car1
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
||||
ld [$ap+8],$mul0 ! ap[2]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp1,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
mov 8,$i
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
mulx $car1,$mul1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add %sp,$bias+$frame,$tp
|
||||
srlx $car1,32,$car1
|
||||
and $car0,1,$sbit
|
||||
srlx $car0,1,$car0
|
||||
mov 4,$j
|
||||
|
||||
.Lsqr_outer:
|
||||
.Lsqr_inner1:
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $j,4,$j
|
||||
ld [$tp+8],$tpj
|
||||
cmp $j,$i
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_inner1
|
||||
add $tp,4,$tp
|
||||
!.Lsqr_inner1
|
||||
|
||||
add $j,4,$j
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
add $acc0,$car1,$car1
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $j,4,$j
|
||||
cmp $j,$num
|
||||
be,pn %icc,.Lsqr_no_inner2
|
||||
add $tp,4,$tp
|
||||
|
||||
.Lsqr_inner2:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car0,$car0
|
||||
ld [$ap+$j],$apj ! ap[j]
|
||||
and $car0,$mask,$acc0
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
or $sbit,$acc0,$acc0
|
||||
add $j,4,$j ! j++
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
cmp $j,$num
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_inner2
|
||||
add $tp,4,$tp ! tp++
|
||||
|
||||
.Lsqr_no_inner2:
|
||||
mulx $apj,$mul0,$acc0
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car0,$car0
|
||||
and $car0,$mask,$acc0
|
||||
srlx $car0,32,$car0
|
||||
add $acc0,$acc0,$acc0
|
||||
or $sbit,$acc0,$acc0
|
||||
srlx $acc0,32,$sbit
|
||||
and $acc0,$mask,$acc0
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp] ! tp[j-1]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
add $i,4,$i ! i++
|
||||
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
|
||||
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
|
||||
ld [$ap+$i],$mul0 ! ap[j]
|
||||
ld [$np],$car1 ! np[0]
|
||||
ld [$np+4],$npj ! np[1]
|
||||
mulx $n0,$tmp1,$mul1
|
||||
and $mul1,$mask,$mul1
|
||||
add $i,4,$tmp0
|
||||
|
||||
mulx $mul0,$mul0,$car0
|
||||
mulx $car1,$mul1,$car1
|
||||
and $car0,$mask,$acc0
|
||||
add $tmp1,$car1,$car1
|
||||
srlx $car0,32,$car0
|
||||
add %sp,$bias+$frame,$tp
|
||||
srlx $car1,32,$car1
|
||||
and $car0,1,$sbit
|
||||
srlx $car0,1,$car0
|
||||
|
||||
cmp $tmp0,$num ! i<num-1
|
||||
bl %icc,.Lsqr_outer
|
||||
mov 4,$j
|
||||
|
||||
.Lsqr_last:
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $j,4,$j
|
||||
ld [$tp+8],$tpj
|
||||
cmp $j,$i
|
||||
add $acc1,$car1,$car1
|
||||
ld [$np+$j],$npj
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
bl %icc,.Lsqr_last
|
||||
add $tp,4,$tp
|
||||
!.Lsqr_last
|
||||
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
add $acc0,$car1,$car1
|
||||
add $acc1,$car1,$car1
|
||||
st $car1,[$tp]
|
||||
srlx $car1,32,$car1
|
||||
|
||||
add $car0,$car0,$car0 ! recover $car0
|
||||
or $sbit,$car0,$car0
|
||||
add $car0,$car1,$car1
|
||||
add $car2,$car1,$car1
|
||||
st $car1,[$tp+4]
|
||||
srlx $car1,32,$car2
|
||||
|
||||
ba .Ltail
|
||||
add $tp,8,$tp
|
||||
.type $fname,#function
|
||||
.size $fname,(.-$fname)
|
||||
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 32
|
||||
___
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
882
openssl-1.0.2f/crypto/bn/asm/sparcv9a-mont.pl
Executable file
882
openssl-1.0.2f/crypto/bn/asm/sparcv9a-mont.pl
Executable file
@@ -0,0 +1,882 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2005
|
||||
#
|
||||
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
|
||||
# Because unlike integer multiplier, which simply stalls whole CPU,
|
||||
# FPU is fully pipelined and can effectively emit 48 bit partial
|
||||
# product every cycle. Why not blended SPARC v9? One can argue that
|
||||
# making this module dependent on UltraSPARC VIS extension limits its
|
||||
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
|
||||
# implementations from compatibility matrix. But the rest, whole Sun
|
||||
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
|
||||
# VIS extension instructions used in this module. This is considered
|
||||
# good enough to not care about HAL SPARC64 users [if any] who have
|
||||
# integer-only pure SPARCv9 module to "fall down" to.
|
||||
|
||||
# USI&II cores currently exhibit uniform 2x improvement [over pre-
|
||||
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
|
||||
# performance improves few percents for shorter keys and worsens few
|
||||
# percents for longer keys. This is because USIII integer multiplier
|
||||
# is >3x faster than USI&II one, which is harder to match [but see
|
||||
# TODO list below]. It should also be noted that SPARC64 V features
|
||||
# out-of-order execution, which *might* mean that integer multiplier
|
||||
# is pipelined, which in turn *might* be impossible to match... On
|
||||
# additional note, SPARC64 V implements FP Multiply-Add instruction,
|
||||
# which is perfectly usable in this context... In other words, as far
|
||||
# as Fujitsu SPARC64 V goes, talk to the author:-)
|
||||
|
||||
# The implementation implies following "non-natural" limitations on
|
||||
# input arguments:
|
||||
# - num may not be less than 4;
|
||||
# - num has to be even;
|
||||
# Failure to meet either condition has no fatal effects, simply
|
||||
# doesn't give any performance gain.
|
||||
|
||||
# TODO:
|
||||
# - modulo-schedule inner loop for better performance (on in-order
|
||||
# execution core such as UltraSPARC this shall result in further
|
||||
# noticeable(!) improvement);
|
||||
# - dedicated squaring procedure[?];
|
||||
|
||||
######################################################################
|
||||
# November 2006
|
||||
#
|
||||
# Modulo-scheduled inner loops allow to interleave floating point and
|
||||
# integer instructions and minimize Read-After-Write penalties. This
|
||||
# results in *further* 20-50% performance improvement [depending on
|
||||
# key length, more for longer keys] on USI&II cores and 30-80% - on
|
||||
# USIII&IV.
|
||||
|
||||
$fname="bn_mul_mont_fpu";
|
||||
$bits=32;
|
||||
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
|
||||
|
||||
if ($bits==64) {
|
||||
$bias=2047;
|
||||
$frame=192;
|
||||
} else {
|
||||
$bias=0;
|
||||
$frame=128; # 96 rounded up to largest known cache-line
|
||||
}
|
||||
$locals=64;
|
||||
|
||||
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
|
||||
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
|
||||
# exclusively for pointers, indexes and other small values...
|
||||
# int bn_mul_mont(
|
||||
$rp="%i0"; # BN_ULONG *rp,
|
||||
$ap="%i1"; # const BN_ULONG *ap,
|
||||
$bp="%i2"; # const BN_ULONG *bp,
|
||||
$np="%i3"; # const BN_ULONG *np,
|
||||
$n0="%i4"; # const BN_ULONG *n0,
|
||||
$num="%i5"; # int num);
|
||||
|
||||
$tp="%l0"; # t[num]
|
||||
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
|
||||
$ap_h="%l2"; # to these four vectors as double-precision FP values.
|
||||
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
|
||||
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
|
||||
$i="%l5";
|
||||
$j="%l6";
|
||||
$mask="%l7"; # 16-bit mask, 0xffff
|
||||
|
||||
$n0="%g4"; # reassigned(!) to "64-bit" register
|
||||
$carry="%i4"; # %i4 reused(!) for a carry bit
|
||||
|
||||
# FP register naming chart
|
||||
#
|
||||
# ..HILO
|
||||
# dcba
|
||||
# --------
|
||||
# LOa
|
||||
# LOb
|
||||
# LOc
|
||||
# LOd
|
||||
# HIa
|
||||
# HIb
|
||||
# HIc
|
||||
# HId
|
||||
# ..a
|
||||
# ..b
|
||||
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
|
||||
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
|
||||
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
|
||||
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
|
||||
|
||||
$dota="%f24"; $dotb="%f26";
|
||||
|
||||
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
|
||||
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
|
||||
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
|
||||
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
|
||||
|
||||
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
|
||||
|
||||
$code=<<___;
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.global $fname
|
||||
.align 32
|
||||
$fname:
|
||||
save %sp,-$frame-$locals,%sp
|
||||
|
||||
cmp $num,4
|
||||
bl,a,pn %icc,.Lret
|
||||
clr %i0
|
||||
andcc $num,1,%g0 ! $num has to be even...
|
||||
bnz,a,pn %icc,.Lret
|
||||
clr %i0 ! signal "unsupported input value"
|
||||
|
||||
srl $num,1,$num
|
||||
sethi %hi(0xffff),$mask
|
||||
ld [%i4+0],$n0 ! $n0 reassigned, remember?
|
||||
or $mask,%lo(0xffff),$mask
|
||||
ld [%i4+4],%o0
|
||||
sllx %o0,32,%o0
|
||||
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
|
||||
|
||||
sll $num,3,$num ! num*=8
|
||||
|
||||
add %sp,$bias,%o0 ! real top of stack
|
||||
sll $num,2,%o1
|
||||
add %o1,$num,%o1 ! %o1=num*5
|
||||
sub %o0,%o1,%o0
|
||||
and %o0,-2048,%o0 ! optimize TLB utilization
|
||||
sub %o0,$bias,%sp ! alloca(5*num*8)
|
||||
|
||||
rd %asi,%o7 ! save %asi
|
||||
add %sp,$bias+$frame+$locals,$tp
|
||||
add $tp,$num,$ap_l
|
||||
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
|
||||
add $ap_l,$num,$ap_h
|
||||
add $ap_h,$num,$np_l
|
||||
add $np_l,$num,$np_h
|
||||
|
||||
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
|
||||
|
||||
add $rp,$num,$rp ! readjust input pointers to point
|
||||
add $ap,$num,$ap ! at the ends too...
|
||||
add $bp,$num,$bp
|
||||
add $np,$num,$np
|
||||
|
||||
stx %o7,[%sp+$bias+$frame+48] ! save %asi
|
||||
|
||||
sub %g0,$num,$i ! i=-num
|
||||
sub %g0,$num,$j ! j=-num
|
||||
|
||||
add $ap,$j,%o3
|
||||
add $bp,$i,%o4
|
||||
|
||||
ld [%o3+4],%g1 ! bp[0]
|
||||
ld [%o3+0],%o0
|
||||
ld [%o4+4],%g5 ! ap[0]
|
||||
sllx %g1,32,%g1
|
||||
ld [%o4+0],%o1
|
||||
sllx %g5,32,%g5
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
add $np,$j,%o5
|
||||
|
||||
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
|
||||
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o3+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+2]%asi,$ba
|
||||
fxtod $alo,$alo
|
||||
ldda [%o4+0]%asi,$bb
|
||||
fxtod $ahi,$ahi
|
||||
ldda [%o4+6]%asi,$bc
|
||||
fxtod $nlo,$nlo
|
||||
ldda [%o4+4]%asi,$bd
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
|
||||
ldda [%sp+$bias+$frame+6]%asi,$na
|
||||
fxtod $ba,$ba
|
||||
ldda [%sp+$bias+$frame+4]%asi,$nb
|
||||
fxtod $bb,$bb
|
||||
ldda [%sp+$bias+$frame+2]%asi,$nc
|
||||
fxtod $bc,$bc
|
||||
ldda [%sp+$bias+$frame+0]%asi,$nd
|
||||
fxtod $bd,$bd
|
||||
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fxtod $na,$na
|
||||
std $ahi,[$ap_h+$j]
|
||||
fxtod $nb,$nb
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fxtod $nc,$nc
|
||||
std $nhi,[$np_h+$j]
|
||||
fxtod $nd,$nd
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
add $j,8,$j
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
add $ap,$j,%o4
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
add $np,$j,%o5
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o4+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
fxtod $alo,$alo
|
||||
fxtod $ahi,$ahi
|
||||
fxtod $nlo,$nlo
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
fmuld $alo,$ba,$aloa
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $nlo,$na,$nloa
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
fmuld $alo,$bb,$alob
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $nlo,$nb,$nlob
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fmuld $alo,$bc,$aloc
|
||||
add %o7,%o1,%o1
|
||||
std $ahi,[$ap_h+$j]
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
srlx %o1,16,%o7
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fmuld $alo,$bd,$alod
|
||||
add %o7,%o2,%o2
|
||||
std $nhi,[$np_h+$j]
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
srlx %o2,16,%o7
|
||||
fmuld $ahi,$ba,$ahia
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
!and %o0,$mask,%o0
|
||||
!and %o1,$mask,%o1
|
||||
!and %o2,$mask,%o2
|
||||
!sllx %o1,16,%o1
|
||||
!sllx %o2,32,%o2
|
||||
!sllx %o3,48,%o7
|
||||
!or %o1,%o0,%o0
|
||||
!or %o2,%o0,%o0
|
||||
!or %o7,%o0,%o0 ! 64-bit result
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fmuld $ahi,$bb,$ahib
|
||||
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $dota,$nloa,$nloa
|
||||
faddd $dotb,$nlob,$nlob
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
addcc $j,8,$j
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
bz,pn %icc,.L1stskip
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
.align 32 ! incidentally already aligned !
|
||||
.L1st:
|
||||
add $ap,$j,%o4
|
||||
add $np,$j,%o5
|
||||
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o4+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
fxtod $alo,$alo
|
||||
fxtod $ahi,$ahi
|
||||
fxtod $nlo,$nlo
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
fmuld $alo,$ba,$aloa
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $nlo,$na,$nloa
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
fmuld $alo,$bb,$alob
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $nlo,$nb,$nlob
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||
fmuld $alo,$bc,$aloc
|
||||
add %o7,%o1,%o1
|
||||
std $ahi,[$ap_h+$j]
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
srlx %o1,16,%o7
|
||||
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||
fmuld $alo,$bd,$alod
|
||||
add %o7,%o2,%o2
|
||||
std $nhi,[$np_h+$j]
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
srlx %o2,16,%o7
|
||||
fmuld $ahi,$ba,$ahia
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
fmuld $ahi,$bb,$ahib
|
||||
sllx %o1,16,%o1
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
sllx %o2,32,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
or %o2,%o0,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
addcc %g1,%o0,%o0
|
||||
faddd $dota,$nloa,$nloa
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
faddd $dotb,$nlob,$nlob
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]=
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
addcc $j,8,$j
|
||||
bnz,pt %icc,.L1st
|
||||
add $tp,8,$tp
|
||||
|
||||
.L1stskip:
|
||||
fdtox $dota,$dota
|
||||
fdtox $dotb,$dotb
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $dota,[%sp+$bias+$frame+32]
|
||||
add %o7,%o1,%o1
|
||||
std $dotb,[%sp+$bias+$frame+40]
|
||||
srlx %o1,16,%o7
|
||||
add %o7,%o2,%o2
|
||||
srlx %o2,16,%o7
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
sllx %o1,16,%o1
|
||||
sllx %o2,32,%o2
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
or %o2,%o0,%o0
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [%sp+$bias+$frame+32],%o4
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [%sp+$bias+$frame+40],%o5
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]=
|
||||
add $tp,8,$tp
|
||||
|
||||
srlx %o4,16,%o7
|
||||
add %o7,%o5,%o5
|
||||
and %o4,$mask,%o4
|
||||
sllx %o5,16,%o7
|
||||
or %o7,%o4,%o4
|
||||
addcc %g1,%o4,%o4
|
||||
srlx %o5,48,%g1
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
mov %g1,$carry
|
||||
stx %o4,[$tp] ! tp[num-1]=
|
||||
|
||||
ba .Louter
|
||||
add $i,8,$i
|
||||
.align 32
|
||||
.Louter:
|
||||
sub %g0,$num,$j ! j=-num
|
||||
add %sp,$bias+$frame+$locals,$tp
|
||||
|
||||
add $ap,$j,%o3
|
||||
add $bp,$i,%o4
|
||||
|
||||
ld [%o3+4],%g1 ! bp[i]
|
||||
ld [%o3+0],%o0
|
||||
ld [%o4+4],%g5 ! ap[0]
|
||||
sllx %g1,32,%g1
|
||||
ld [%o4+0],%o1
|
||||
sllx %g5,32,%g5
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
ldx [$tp],%o2 ! tp[0]
|
||||
mulx %o1,%o0,%o0
|
||||
addcc %o2,%o0,%o0
|
||||
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+2]%asi,$ba
|
||||
ldda [%o4+0]%asi,$bb
|
||||
ldda [%o4+6]%asi,$bc
|
||||
ldda [%o4+4]%asi,$bd
|
||||
|
||||
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
|
||||
ldda [%sp+$bias+$frame+6]%asi,$na
|
||||
fxtod $ba,$ba
|
||||
ldda [%sp+$bias+$frame+4]%asi,$nb
|
||||
fxtod $bb,$bb
|
||||
ldda [%sp+$bias+$frame+2]%asi,$nc
|
||||
fxtod $bc,$bc
|
||||
ldda [%sp+$bias+$frame+0]%asi,$nd
|
||||
fxtod $bd,$bd
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
fxtod $na,$na
|
||||
ldd [$ap_h+$j],$ahi
|
||||
fxtod $nb,$nb
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
fxtod $nc,$nc
|
||||
ldd [$np_h+$j],$nhi
|
||||
fxtod $nd,$nd
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
faddd $nloc,$nhia,$nloc
|
||||
faddd $nlod,$nhib,$nlod
|
||||
|
||||
fdtox $nloa,$nloa
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
add $j,8,$j
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
ldd [$ap_h+$j],$ahi
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
ldd [$np_h+$j],$nhi
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $alo,$bd,$alod
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $ahi,$ba,$ahia
|
||||
|
||||
srlx %o0,16,%o7
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
add %o7,%o1,%o1
|
||||
fmuld $ahi,$bb,$ahib
|
||||
srlx %o1,16,%o7
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
add %o7,%o2,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
srlx %o2,16,%o7
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
! why?
|
||||
and %o0,$mask,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
sllx %o1,16,%o1
|
||||
faddd $dota,$nloa,$nloa
|
||||
sllx %o2,32,%o2
|
||||
faddd $dotb,$nlob,$nlob
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
or %o2,%o0,%o0
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [$tp],%o7
|
||||
faddd $nloc,$nhia,$nloc
|
||||
addcc %o7,%o0,%o0
|
||||
! end-of-why?
|
||||
faddd $nlod,$nhib,$nlod
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fdtox $nloa,$nloa
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
fdtox $nlob,$nlob
|
||||
fdtox $nloc,$nloc
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
addcc $j,8,$j
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
bz,pn %icc,.Linnerskip
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
|
||||
ba .Linner
|
||||
nop
|
||||
.align 32
|
||||
.Linner:
|
||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||
ldd [$ap_h+$j],$ahi
|
||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||
ldd [$np_h+$j],$nhi
|
||||
|
||||
fmuld $alo,$ba,$aloa
|
||||
fmuld $nlo,$na,$nloa
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
fmuld $alo,$bd,$alod
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
fmuld $ahi,$ba,$ahia
|
||||
|
||||
srlx %o0,16,%o7
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
add %o7,%o1,%o1
|
||||
fmuld $ahi,$bb,$ahib
|
||||
srlx %o1,16,%o7
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
add %o7,%o2,%o2
|
||||
fmuld $ahi,$bc,$ahic
|
||||
srlx %o2,16,%o7
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
fmuld $ahi,$bd,$ahid
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
sllx %o1,16,%o1
|
||||
faddd $dota,$nloa,$nloa
|
||||
sllx %o2,32,%o2
|
||||
faddd $dotb,$nlob,$nlob
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
or %o2,%o0,%o0
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
faddd $nloc,$nhia,$nloc
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [$tp+8],%o7 ! tp[j]
|
||||
faddd $nlod,$nhib,$nlod
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
fdtox $nloa,$nloa
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
fdtox $nlob,$nlob
|
||||
addcc %o7,%o0,%o0
|
||||
fdtox $nloc,$nloc
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]
|
||||
fdtox $nlod,$nlod
|
||||
|
||||
std $nloa,[%sp+$bias+$frame+0]
|
||||
std $nlob,[%sp+$bias+$frame+8]
|
||||
std $nloc,[%sp+$bias+$frame+16]
|
||||
addcc $j,8,$j
|
||||
std $nlod,[%sp+$bias+$frame+24]
|
||||
bnz,pt %icc,.Linner
|
||||
add $tp,8,$tp
|
||||
|
||||
.Linnerskip:
|
||||
fdtox $dota,$dota
|
||||
fdtox $dotb,$dotb
|
||||
|
||||
ldx [%sp+$bias+$frame+0],%o0
|
||||
ldx [%sp+$bias+$frame+8],%o1
|
||||
ldx [%sp+$bias+$frame+16],%o2
|
||||
ldx [%sp+$bias+$frame+24],%o3
|
||||
|
||||
srlx %o0,16,%o7
|
||||
std $dota,[%sp+$bias+$frame+32]
|
||||
add %o7,%o1,%o1
|
||||
std $dotb,[%sp+$bias+$frame+40]
|
||||
srlx %o1,16,%o7
|
||||
add %o7,%o2,%o2
|
||||
srlx %o2,16,%o7
|
||||
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||
and %o0,$mask,%o0
|
||||
and %o1,$mask,%o1
|
||||
and %o2,$mask,%o2
|
||||
sllx %o1,16,%o1
|
||||
sllx %o2,32,%o2
|
||||
sllx %o3,48,%o7
|
||||
or %o1,%o0,%o0
|
||||
or %o2,%o0,%o0
|
||||
ldx [%sp+$bias+$frame+32],%o4
|
||||
or %o7,%o0,%o0 ! 64-bit result
|
||||
ldx [%sp+$bias+$frame+40],%o5
|
||||
addcc %g1,%o0,%o0
|
||||
ldx [$tp+8],%o7 ! tp[j]
|
||||
srlx %o3,16,%g1 ! 34-bit carry
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
addcc %o7,%o0,%o0
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
stx %o0,[$tp] ! tp[j-1]
|
||||
add $tp,8,$tp
|
||||
|
||||
srlx %o4,16,%o7
|
||||
add %o7,%o5,%o5
|
||||
and %o4,$mask,%o4
|
||||
sllx %o5,16,%o7
|
||||
or %o7,%o4,%o4
|
||||
addcc %g1,%o4,%o4
|
||||
srlx %o5,48,%g1
|
||||
bcs,a %xcc,.+8
|
||||
add %g1,1,%g1
|
||||
|
||||
addcc $carry,%o4,%o4
|
||||
stx %o4,[$tp] ! tp[num-1]
|
||||
mov %g1,$carry
|
||||
bcs,a %xcc,.+8
|
||||
add $carry,1,$carry
|
||||
|
||||
addcc $i,8,$i
|
||||
bnz %icc,.Louter
|
||||
nop
|
||||
|
||||
add $tp,8,$tp ! adjust tp to point at the end
|
||||
orn %g0,%g0,%g4
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
ba .Lsub
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
|
||||
.align 32
|
||||
.Lsub:
|
||||
ldx [$tp+%o7],%o0
|
||||
add $np,%o7,%g1
|
||||
ld [%g1+0],%o2
|
||||
ld [%g1+4],%o3
|
||||
srlx %o0,32,%o1
|
||||
subccc %o0,%o2,%o2
|
||||
add $rp,%o7,%g1
|
||||
subccc %o1,%o3,%o3
|
||||
st %o2,[%g1+0]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lsub
|
||||
st %o3,[%g1+4]
|
||||
subc $carry,0,%g4
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
ba .Lcopy
|
||||
nop
|
||||
|
||||
.align 32
|
||||
.Lcopy:
|
||||
ldx [$tp+%o7],%o0
|
||||
add $rp,%o7,%g1
|
||||
ld [%g1+0],%o2
|
||||
ld [%g1+4],%o3
|
||||
stx %g0,[$tp+%o7]
|
||||
and %o0,%g4,%o0
|
||||
srlx %o0,32,%o1
|
||||
andn %o2,%g4,%o2
|
||||
andn %o3,%g4,%o3
|
||||
or %o2,%o0,%o0
|
||||
or %o3,%o1,%o1
|
||||
st %o0,[%g1+0]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lcopy
|
||||
st %o1,[%g1+4]
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.Lzap:
|
||||
stx %g0,[$ap_l+%o7]
|
||||
stx %g0,[$ap_h+%o7]
|
||||
stx %g0,[$np_l+%o7]
|
||||
stx %g0,[$np_h+%o7]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lzap
|
||||
nop
|
||||
|
||||
ldx [%sp+$bias+$frame+48],%o7
|
||||
wr %g0,%o7,%asi ! restore %asi
|
||||
|
||||
mov 1,%i0
|
||||
.Lret:
|
||||
ret
|
||||
restore
|
||||
.type $fname,#function
|
||||
.size $fname,(.-$fname)
|
||||
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 32
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
|
||||
# Below substitution makes it possible to compile without demanding
|
||||
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
|
||||
# dare to do this, because VIS capability is detected at run-time now
|
||||
# and this routine is not called on CPU not capable to execute it. Do
|
||||
# note that fzeros is not the only VIS dependency! Another dependency
|
||||
# is implicit and is just _a_ numerical value loaded to %asi register,
|
||||
# which assembler can't recognize as VIS specific...
|
||||
$code =~ s/fzeros\s+%f([0-9]+)/
|
||||
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
|
||||
/gem;
|
||||
|
||||
print $code;
|
||||
# flush
|
||||
close STDOUT;
|
||||
242
openssl-1.0.2f/crypto/bn/asm/via-mont.pl
Normal file
242
openssl-1.0.2f/crypto/bn/asm/via-mont.pl
Normal file
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# Wrapper around 'rep montmul', VIA-specific instruction accessing
|
||||
# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
|
||||
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
|
||||
#
|
||||
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
|
||||
# different software configurations on 1.5GHz VIA Esther processor.
|
||||
# Lines marked with "software integer" denote performance of hand-
|
||||
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
|
||||
# refers to hand-coded SSE2 Montgomery multiplication procedure found
|
||||
# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
|
||||
# Padlock SDK 2.0.1 available for download from VIA, which naturally
|
||||
# utilizes the magic 'repz montmul' instruction. And finally "hardware
|
||||
# this" refers to *this* implementation which also uses 'repz montmul'
|
||||
#
|
||||
# sign verify sign/s verify/s
|
||||
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
|
||||
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
|
||||
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
|
||||
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
|
||||
#
|
||||
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
|
||||
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
|
||||
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
|
||||
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
|
||||
#
|
||||
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
|
||||
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
|
||||
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
|
||||
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
|
||||
#
|
||||
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
|
||||
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
|
||||
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
|
||||
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
|
||||
#
|
||||
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
|
||||
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
|
||||
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
|
||||
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
|
||||
#
|
||||
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
|
||||
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
|
||||
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
|
||||
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
|
||||
#
|
||||
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
|
||||
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
|
||||
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
|
||||
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
|
||||
#
|
||||
# To give you some other reference point here is output for 2.4GHz P4
|
||||
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
|
||||
# SSE2" in above terms.
|
||||
#
|
||||
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
|
||||
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
|
||||
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
|
||||
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
|
||||
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
|
||||
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
|
||||
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
|
||||
#
|
||||
# Conclusions:
|
||||
# - VIA SDK leaves a *lot* of room for improvement (which this
|
||||
# implementation successfully fills:-);
|
||||
# - 'rep montmul' gives up to >3x performance improvement depending on
|
||||
# key length;
|
||||
# - in terms of absolute performance it delivers approximately as much
|
||||
# as modern out-of-order 32-bit cores [again, for longer keys].
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"via-mont.pl");
|
||||
|
||||
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
|
||||
$func="bn_mul_mont_padlock";
|
||||
|
||||
$pad=16*1; # amount of reserved bytes on top of every vector
|
||||
|
||||
# stack layout
|
||||
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
|
||||
$A=&DWP(4,"esp");
|
||||
$B=&DWP(8,"esp");
|
||||
$T=&DWP(12,"esp");
|
||||
$M=&DWP(16,"esp");
|
||||
$scratch=&DWP(20,"esp");
|
||||
$rp=&DWP(24,"esp"); # these are mine
|
||||
$sp=&DWP(28,"esp");
|
||||
# &DWP(32,"esp") # 32 byte scratch area
|
||||
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
|
||||
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
|
||||
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
|
||||
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
|
||||
# Note that SDK suggests to unconditionally allocate 2K per vector. This
|
||||
# has quite an impact on performance. It naturally depends on key length,
|
||||
# but to give an example 1024 bit private RSA key operations suffer >30%
|
||||
# penalty. I allocate only as much as actually required...
|
||||
|
||||
&function_begin($func);
|
||||
&xor ("eax","eax");
|
||||
&mov ("ecx",&wparam(5)); # num
|
||||
# meet VIA's limitations for num [note that the specification
|
||||
# expresses them in bits, while we work with amount of 32-bit words]
|
||||
&test ("ecx",3);
|
||||
&jnz (&label("leave")); # num % 4 != 0
|
||||
&cmp ("ecx",8);
|
||||
&jb (&label("leave")); # num < 8
|
||||
&cmp ("ecx",1024);
|
||||
&ja (&label("leave")); # num > 1024
|
||||
|
||||
&pushf ();
|
||||
&cld ();
|
||||
|
||||
&mov ("edi",&wparam(0)); # rp
|
||||
&mov ("eax",&wparam(1)); # ap
|
||||
&mov ("ebx",&wparam(2)); # bp
|
||||
&mov ("edx",&wparam(3)); # np
|
||||
&mov ("esi",&wparam(4)); # n0
|
||||
&mov ("esi",&DWP(0,"esi")); # *n0
|
||||
|
||||
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
|
||||
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
|
||||
&neg ("ebp");
|
||||
&add ("ebp","esp");
|
||||
&and ("ebp",-64); # align to cache-line
|
||||
&xchg ("ebp","esp"); # alloca
|
||||
|
||||
&mov ($rp,"edi"); # save rp
|
||||
&mov ($sp,"ebp"); # save esp
|
||||
|
||||
&mov ($mZeroPrime,"esi");
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
&mov ($T,"esi");
|
||||
&lea ("edi",&DWP(32,"esp")); # scratch area
|
||||
&mov ($scratch,"edi");
|
||||
&mov ("esi","eax");
|
||||
|
||||
&lea ("ebp",&DWP(-$pad,"ecx"));
|
||||
&shr ("ebp",2); # restore original num value in ebp
|
||||
|
||||
&xor ("eax","eax");
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
|
||||
&mov ($A,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded ap copy...
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","ebx");
|
||||
&mov ($B,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded bp copy...
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","edx");
|
||||
&mov ($M,"edi");
|
||||
&data_byte(0xf3,0xa5); # rep movsl, memcpy
|
||||
&mov ("ecx",$pad/4);
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero pad
|
||||
# edi points at the end of padded np copy...
|
||||
|
||||
# let magic happen...
|
||||
&mov ("ecx","ebp");
|
||||
&mov ("esi","esp");
|
||||
&shl ("ecx",5); # convert word counter to bit counter
|
||||
&align (4);
|
||||
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
|
||||
|
||||
&mov ("ecx","ebp");
|
||||
&lea ("esi",&DWP(64,"esp")); # tp
|
||||
# edi still points at the end of padded np copy...
|
||||
&neg ("ebp");
|
||||
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
|
||||
&mov ("edi",$rp); # restore rp
|
||||
&xor ("edx","edx"); # i=0 and clear CF
|
||||
|
||||
&set_label("sub",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&sbb ("eax",&DWP(0,"ebp","edx",4));
|
||||
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("sub")); # doesn't affect CF!
|
||||
|
||||
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
|
||||
&sbb ("eax",0);
|
||||
&and ("esi","eax");
|
||||
¬ ("eax");
|
||||
&mov ("ebp","edi");
|
||||
&and ("ebp","eax");
|
||||
&or ("esi","ebp"); # tp=carry?tp:rp
|
||||
|
||||
&mov ("ecx","edx"); # num
|
||||
&xor ("edx","edx"); # i=0
|
||||
|
||||
&set_label("copy",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
|
||||
&mov (&DWP(0,"edi","edx",4),"eax");
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("copy"));
|
||||
|
||||
&mov ("ebp",$sp);
|
||||
&xor ("eax","eax");
|
||||
|
||||
&mov ("ecx",64/4);
|
||||
&mov ("edi","esp"); # zap frame including scratch area
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
# zap copies of ap, bp and np
|
||||
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
|
||||
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
|
||||
&data_byte(0xf3,0xab); # rep stosl, bzero
|
||||
|
||||
&mov ("esp","ebp");
|
||||
&inc ("eax"); # signal "done"
|
||||
&popf ();
|
||||
&set_label("leave");
|
||||
&function_end($func);
|
||||
|
||||
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
373
openssl-1.0.2f/crypto/bn/asm/vis3-mont.pl
Normal file
373
openssl-1.0.2f/crypto/bn/asm/vis3-mont.pl
Normal file
@@ -0,0 +1,373 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2012.
|
||||
#
|
||||
# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
|
||||
# onward. There are three new instructions used here: umulxhi,
|
||||
# addxc[cc] and initializing store. On T3 RSA private key operations
|
||||
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
|
||||
# lengths. This is without dedicated squaring procedure. On T4
|
||||
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
|
||||
# for reference purposes, because T4 has dedicated Montgomery
|
||||
# multiplication and squaring *instructions* that deliver even more.
|
||||
|
||||
$bits=32;
|
||||
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
|
||||
if ($bits==64) { $bias=2047; $frame=192; }
|
||||
else { $bias=0; $frame=112; }
|
||||
|
||||
$code.=<<___ if ($bits==64);
|
||||
.register %g2,#scratch
|
||||
.register %g3,#scratch
|
||||
___
|
||||
$code.=<<___;
|
||||
.section ".text",#alloc,#execinstr
|
||||
___
|
||||
|
||||
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
|
||||
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
|
||||
|
||||
# int bn_mul_mont(
|
||||
$rp="%o0"; # BN_ULONG *rp,
|
||||
$ap="%o1"; # const BN_ULONG *ap,
|
||||
$bp="%o2"; # const BN_ULONG *bp,
|
||||
$np="%o3"; # const BN_ULONG *np,
|
||||
$n0p="%o4"; # const BN_ULONG *n0,
|
||||
$num="%o5"; # int num); # caller ensures that num is even
|
||||
# and >=6
|
||||
$code.=<<___;
|
||||
.globl bn_mul_mont_vis3
|
||||
.align 32
|
||||
bn_mul_mont_vis3:
|
||||
add %sp, $bias, %g4 ! real top of stack
|
||||
sll $num, 2, $num ! size in bytes
|
||||
add $num, 63, %g5
|
||||
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
|
||||
add %g5, %g5, %g1
|
||||
add %g5, %g1, %g1 ! 3*buffer size
|
||||
sub %g4, %g1, %g1
|
||||
andn %g1, 63, %g1 ! align at 64 byte
|
||||
sub %g1, $frame, %g1 ! new top of stack
|
||||
sub %g1, %g4, %g1
|
||||
|
||||
save %sp, %g1, %sp
|
||||
___
|
||||
|
||||
# +-------------------------------+<----- %sp
|
||||
# . .
|
||||
# +-------------------------------+<----- aligned at 64 bytes
|
||||
# | __int64 tmp[0] |
|
||||
# +-------------------------------+
|
||||
# . .
|
||||
# . .
|
||||
# +-------------------------------+<----- aligned at 64 bytes
|
||||
# | __int64 ap[1..0] | converted ap[]
|
||||
# +-------------------------------+
|
||||
# | __int64 np[1..0] | converted np[]
|
||||
# +-------------------------------+
|
||||
# | __int64 ap[3..2] |
|
||||
# . .
|
||||
# . .
|
||||
# +-------------------------------+
|
||||
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
|
||||
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
|
||||
($ovf,$i)=($t0,$t1);
|
||||
$code.=<<___;
|
||||
ld [$n0p+0], $t0 ! pull n0[0..1] value
|
||||
add %sp, $bias+$frame, $tp
|
||||
ld [$n0p+4], $t1
|
||||
add $tp, %g5, $anp
|
||||
ld [$bp+0], $t2 ! m0=bp[0]
|
||||
sllx $t1, 32, $n0
|
||||
ld [$bp+4], $t3
|
||||
or $t0, $n0, $n0
|
||||
add $bp, 8, $bp
|
||||
|
||||
ld [$ap+0], $t0 ! ap[0]
|
||||
sllx $t3, 32, $m0
|
||||
ld [$ap+4], $t1
|
||||
or $t2, $m0, $m0
|
||||
|
||||
ld [$ap+8], $t2 ! ap[1]
|
||||
sllx $t1, 32, $aj
|
||||
ld [$ap+12], $t3
|
||||
or $t0, $aj, $aj
|
||||
add $ap, 16, $ap
|
||||
stx $aj, [$anp] ! converted ap[0]
|
||||
|
||||
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
|
||||
umulxhi $aj, $m0, $hi0
|
||||
|
||||
ld [$np+0], $t0 ! np[0]
|
||||
sllx $t3, 32, $aj
|
||||
ld [$np+4], $t1
|
||||
or $t2, $aj, $aj
|
||||
|
||||
ld [$np+8], $t2 ! np[1]
|
||||
sllx $t1, 32, $nj
|
||||
ld [$np+12], $t3
|
||||
or $t0, $nj, $nj
|
||||
add $np, 16, $np
|
||||
stx $nj, [$anp+8] ! converted np[0]
|
||||
|
||||
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
|
||||
stx $aj, [$anp+16] ! converted ap[1]
|
||||
|
||||
mulx $aj, $m0, $alo ! ap[1]*bp[0]
|
||||
umulxhi $aj, $m0, $aj ! ahi=aj
|
||||
|
||||
mulx $nj, $m1, $lo1 ! np[0]*m1
|
||||
umulxhi $nj, $m1, $hi1
|
||||
|
||||
sllx $t3, 32, $nj
|
||||
or $t2, $nj, $nj
|
||||
stx $nj, [$anp+24] ! converted np[1]
|
||||
add $anp, 32, $anp
|
||||
|
||||
addcc $lo0, $lo1, $lo1
|
||||
addxc %g0, $hi1, $hi1
|
||||
|
||||
mulx $nj, $m1, $nlo ! np[1]*m1
|
||||
umulxhi $nj, $m1, $nj ! nhi=nj
|
||||
|
||||
ba .L1st
|
||||
sub $num, 24, $cnt ! cnt=num-3
|
||||
|
||||
.align 16
|
||||
.L1st:
|
||||
ld [$ap+0], $t0 ! ap[j]
|
||||
addcc $alo, $hi0, $lo0
|
||||
ld [$ap+4], $t1
|
||||
addxc $aj, %g0, $hi0
|
||||
|
||||
sllx $t1, 32, $aj
|
||||
add $ap, 8, $ap
|
||||
or $t0, $aj, $aj
|
||||
stx $aj, [$anp] ! converted ap[j]
|
||||
|
||||
ld [$np+0], $t2 ! np[j]
|
||||
addcc $nlo, $hi1, $lo1
|
||||
ld [$np+4], $t3
|
||||
addxc $nj, %g0, $hi1 ! nhi=nj
|
||||
|
||||
sllx $t3, 32, $nj
|
||||
add $np, 8, $np
|
||||
mulx $aj, $m0, $alo ! ap[j]*bp[0]
|
||||
or $t2, $nj, $nj
|
||||
umulxhi $aj, $m0, $aj ! ahi=aj
|
||||
stx $nj, [$anp+8] ! converted np[j]
|
||||
add $anp, 16, $anp ! anp++
|
||||
|
||||
mulx $nj, $m1, $nlo ! np[j]*m1
|
||||
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
|
||||
umulxhi $nj, $m1, $nj ! nhi=nj
|
||||
addxc %g0, $hi1, $hi1
|
||||
stx $lo1, [$tp] ! tp[j-1]
|
||||
add $tp, 8, $tp ! tp++
|
||||
|
||||
brnz,pt $cnt, .L1st
|
||||
sub $cnt, 8, $cnt ! j--
|
||||
!.L1st
|
||||
addcc $alo, $hi0, $lo0
|
||||
addxc $aj, %g0, $hi0 ! ahi=aj
|
||||
|
||||
addcc $nlo, $hi1, $lo1
|
||||
addxc $nj, %g0, $hi1
|
||||
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
|
||||
addxc %g0, $hi1, $hi1
|
||||
stx $lo1, [$tp] ! tp[j-1]
|
||||
add $tp, 8, $tp
|
||||
|
||||
addcc $hi0, $hi1, $hi1
|
||||
addxc %g0, %g0, $ovf ! upmost overflow bit
|
||||
stx $hi1, [$tp]
|
||||
add $tp, 8, $tp
|
||||
|
||||
ba .Louter
|
||||
sub $num, 16, $i ! i=num-2
|
||||
|
||||
.align 16
|
||||
.Louter:
|
||||
ld [$bp+0], $t2 ! m0=bp[i]
|
||||
ld [$bp+4], $t3
|
||||
|
||||
sub $anp, $num, $anp ! rewind
|
||||
sub $tp, $num, $tp
|
||||
sub $anp, $num, $anp
|
||||
|
||||
add $bp, 8, $bp
|
||||
sllx $t3, 32, $m0
|
||||
ldx [$anp+0], $aj ! ap[0]
|
||||
or $t2, $m0, $m0
|
||||
ldx [$anp+8], $nj ! np[0]
|
||||
|
||||
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
|
||||
ldx [$tp], $tj ! tp[0]
|
||||
umulxhi $aj, $m0, $hi0
|
||||
ldx [$anp+16], $aj ! ap[1]
|
||||
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
|
||||
mulx $aj, $m0, $alo ! ap[1]*bp[i]
|
||||
addxc %g0, $hi0, $hi0
|
||||
mulx $lo0, $n0, $m1 ! tp[0]*n0
|
||||
umulxhi $aj, $m0, $aj ! ahi=aj
|
||||
mulx $nj, $m1, $lo1 ! np[0]*m1
|
||||
umulxhi $nj, $m1, $hi1
|
||||
ldx [$anp+24], $nj ! np[1]
|
||||
add $anp, 32, $anp
|
||||
addcc $lo1, $lo0, $lo1
|
||||
mulx $nj, $m1, $nlo ! np[1]*m1
|
||||
addxc %g0, $hi1, $hi1
|
||||
umulxhi $nj, $m1, $nj ! nhi=nj
|
||||
|
||||
ba .Linner
|
||||
sub $num, 24, $cnt ! cnt=num-3
|
||||
.align 16
|
||||
.Linner:
|
||||
addcc $alo, $hi0, $lo0
|
||||
ldx [$tp+8], $tj ! tp[j]
|
||||
addxc $aj, %g0, $hi0 ! ahi=aj
|
||||
ldx [$anp+0], $aj ! ap[j]
|
||||
addcc $nlo, $hi1, $lo1
|
||||
mulx $aj, $m0, $alo ! ap[j]*bp[i]
|
||||
addxc $nj, %g0, $hi1 ! nhi=nj
|
||||
ldx [$anp+8], $nj ! np[j]
|
||||
add $anp, 16, $anp
|
||||
umulxhi $aj, $m0, $aj ! ahi=aj
|
||||
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
|
||||
mulx $nj, $m1, $nlo ! np[j]*m1
|
||||
addxc %g0, $hi0, $hi0
|
||||
umulxhi $nj, $m1, $nj ! nhi=nj
|
||||
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addxc %g0, $hi1, $hi1
|
||||
stx $lo1, [$tp] ! tp[j-1]
|
||||
add $tp, 8, $tp
|
||||
brnz,pt $cnt, .Linner
|
||||
sub $cnt, 8, $cnt
|
||||
!.Linner
|
||||
ldx [$tp+8], $tj ! tp[j]
|
||||
addcc $alo, $hi0, $lo0
|
||||
addxc $aj, %g0, $hi0 ! ahi=aj
|
||||
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
|
||||
addxc %g0, $hi0, $hi0
|
||||
|
||||
addcc $nlo, $hi1, $lo1
|
||||
addxc $nj, %g0, $hi1 ! nhi=nj
|
||||
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
addxc %g0, $hi1, $hi1
|
||||
stx $lo1, [$tp] ! tp[j-1]
|
||||
|
||||
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
|
||||
addxccc $hi1, $hi0, $hi1
|
||||
addxc %g0, %g0, $ovf
|
||||
stx $hi1, [$tp+8]
|
||||
add $tp, 16, $tp
|
||||
|
||||
brnz,pt $i, .Louter
|
||||
sub $i, 8, $i
|
||||
|
||||
sub $anp, $num, $anp ! rewind
|
||||
sub $tp, $num, $tp
|
||||
sub $anp, $num, $anp
|
||||
ba .Lsub
|
||||
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
|
||||
|
||||
.align 16
|
||||
.Lsub:
|
||||
ldx [$tp], $tj
|
||||
add $tp, 8, $tp
|
||||
ldx [$anp+8], $nj
|
||||
add $anp, 16, $anp
|
||||
subccc $tj, $nj, $t2 ! tp[j]-np[j]
|
||||
srlx $tj, 32, $tj
|
||||
srlx $nj, 32, $nj
|
||||
subccc $tj, $nj, $t3
|
||||
add $rp, 8, $rp
|
||||
st $t2, [$rp-4] ! reverse order
|
||||
st $t3, [$rp-8]
|
||||
brnz,pt $cnt, .Lsub
|
||||
sub $cnt, 8, $cnt
|
||||
|
||||
sub $anp, $num, $anp ! rewind
|
||||
sub $tp, $num, $tp
|
||||
sub $anp, $num, $anp
|
||||
sub $rp, $num, $rp
|
||||
|
||||
subc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
and $tp, $ovf, $ap
|
||||
andn $rp, $ovf, $np
|
||||
or $np, $ap, $ap ! ap=borrow?tp:rp
|
||||
ba .Lcopy
|
||||
sub $num, 8, $cnt
|
||||
|
||||
.align 16
|
||||
.Lcopy: ! copy or in-place refresh
|
||||
ld [$ap+0], $t2
|
||||
ld [$ap+4], $t3
|
||||
add $ap, 8, $ap
|
||||
stx %g0, [$tp] ! zap
|
||||
add $tp, 8, $tp
|
||||
stx %g0, [$anp] ! zap
|
||||
stx %g0, [$anp+8]
|
||||
add $anp, 16, $anp
|
||||
st $t3, [$rp+0] ! flip order
|
||||
st $t2, [$rp+4]
|
||||
add $rp, 8, $rp
|
||||
brnz $cnt, .Lcopy
|
||||
sub $cnt, 8, $cnt
|
||||
|
||||
mov 1, %o0
|
||||
ret
|
||||
restore
|
||||
.type bn_mul_mont_vis3, #function
|
||||
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
|
||||
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
___
|
||||
|
||||
# Purpose of these subroutines is to explicitly encode VIS instructions,
|
||||
# so that one can compile the module without having to specify VIS
|
||||
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
|
||||
# Idea is to reserve for option to produce "universal" binary and let
|
||||
# programmer detect if current CPU is VIS capable at run-time.
|
||||
sub unvis3 {
|
||||
my ($mnemonic,$rs1,$rs2,$rd)=@_;
|
||||
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
|
||||
my ($ref,$opf);
|
||||
my %visopf = ( "addxc" => 0x011,
|
||||
"addxccc" => 0x013,
|
||||
"umulxhi" => 0x016 );
|
||||
|
||||
$ref = "$mnemonic\t$rs1,$rs2,$rd";
|
||||
|
||||
if ($opf=$visopf{$mnemonic}) {
|
||||
foreach ($rs1,$rs2,$rd) {
|
||||
return $ref if (!/%([goli])([0-9])/);
|
||||
$_=$bias{$1}+$2;
|
||||
}
|
||||
|
||||
return sprintf ".word\t0x%08x !%s",
|
||||
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
|
||||
$ref;
|
||||
} else {
|
||||
return $ref;
|
||||
}
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
|
||||
&unvis3($1,$2,$3,$4)
|
||||
/ge;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT;
|
||||
6440
openssl-1.0.2f/crypto/bn/asm/vms.mar
Normal file
6440
openssl-1.0.2f/crypto/bn/asm/vms.mar
Normal file
File diff suppressed because it is too large
Load Diff
313
openssl-1.0.2f/crypto/bn/asm/x86-gf2m.pl
Normal file
313
openssl-1.0.2f/crypto/bn/asm/x86-gf2m.pl
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... Except that it has three code paths: pure integer
|
||||
# code suitable for any x86 CPU, MMX code suitable for PIII and later
|
||||
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
|
||||
# from one benchmark and µ-arch to another. Below are interval values
|
||||
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
|
||||
# code:
|
||||
#
|
||||
# PIII 16%-30%
|
||||
# P4 12%-12%
|
||||
# Opteron 18%-40%
|
||||
# Core2 19%-44%
|
||||
# Atom 38%-64%
|
||||
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
|
||||
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
|
||||
#
|
||||
# Note that above improvement coefficients are not coefficients for
|
||||
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
|
||||
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
|
||||
# is more and more dominated by other subroutines, most notably by
|
||||
# BN_GF2m_mod[_mul]_arr...
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
&external_label("OPENSSL_ia32cap_P") if ($sse2);
|
||||
|
||||
$a="eax";
|
||||
$b="ebx";
|
||||
($a1,$a2,$a4)=("ecx","edx","ebp");
|
||||
|
||||
$R="mm0";
|
||||
@T=("mm1","mm2");
|
||||
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
|
||||
@i=("esi","edi");
|
||||
|
||||
if (!$x86only) {
|
||||
&function_begin_B("_mul_1x1_mmx");
|
||||
&sub ("esp",32+4);
|
||||
&mov ($a1,$a);
|
||||
&lea ($a2,&DWP(0,$a,$a));
|
||||
&and ($a1,0x3fffffff);
|
||||
&lea ($a4,&DWP(0,$a2,$a2));
|
||||
&mov (&DWP(0*4,"esp"),0);
|
||||
&and ($a2,0x7fffffff);
|
||||
&movd ($A,$a);
|
||||
&movd ($B,$b);
|
||||
&mov (&DWP(1*4,"esp"),$a1); # a1
|
||||
&xor ($a1,$a2); # a1^a2
|
||||
&pxor ($B31,$B31);
|
||||
&pxor ($B30,$B30);
|
||||
&mov (&DWP(2*4,"esp"),$a2); # a2
|
||||
&xor ($a2,$a4); # a2^a4
|
||||
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
|
||||
&pcmpgtd($B31,$A); # broadcast 31st bit
|
||||
&paddd ($A,$A); # $A<<=1
|
||||
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
|
||||
&mov (&DWP(4*4,"esp"),$a4); # a4
|
||||
&xor ($a4,$a2); # a2=a4^a2^a4
|
||||
&pand ($B31,$B);
|
||||
&pcmpgtd($B30,$A); # broadcast 30th bit
|
||||
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
|
||||
&xor ($a4,$a1); # a1^a2^a4
|
||||
&psllq ($B31,31);
|
||||
&pand ($B30,$B);
|
||||
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
|
||||
&mov (@i[0],0x7);
|
||||
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
|
||||
&mov ($a4,@i[0]);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
&mov (@i[1],$a4);
|
||||
&psllq ($B30,30);
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&movd ($R,&DWP(0,"esp",@i[0],4));
|
||||
&mov (@i[0],$a4);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
for($n=1;$n<9;$n++) {
|
||||
&movd (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@i[1],$a4);
|
||||
&psllq (@T[1],3*$n);
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&pxor ($R,@T[1]);
|
||||
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
&movd (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&pxor ($R,$B30);
|
||||
&psllq (@T[1],3*$n++);
|
||||
&pxor ($R,@T[1]);
|
||||
|
||||
&movd (@T[0],&DWP(0,"esp",@i[0],4));
|
||||
&pxor ($R,$B31);
|
||||
&psllq (@T[0],3*$n);
|
||||
&add ("esp",32+4);
|
||||
&pxor ($R,@T[0]);
|
||||
&ret ();
|
||||
&function_end_B("_mul_1x1_mmx");
|
||||
}
|
||||
|
||||
($lo,$hi)=("eax","edx");
|
||||
@T=("ecx","ebp");
|
||||
|
||||
&function_begin_B("_mul_1x1_ialu");
|
||||
&sub ("esp",32+4);
|
||||
&mov ($a1,$a);
|
||||
&lea ($a2,&DWP(0,$a,$a));
|
||||
&lea ($a4,&DWP(0,"",$a,4));
|
||||
&and ($a1,0x3fffffff);
|
||||
&lea (@i[1],&DWP(0,$lo,$lo));
|
||||
&sar ($lo,31); # broadcast 31st bit
|
||||
&mov (&DWP(0*4,"esp"),0);
|
||||
&and ($a2,0x7fffffff);
|
||||
&mov (&DWP(1*4,"esp"),$a1); # a1
|
||||
&xor ($a1,$a2); # a1^a2
|
||||
&mov (&DWP(2*4,"esp"),$a2); # a2
|
||||
&xor ($a2,$a4); # a2^a4
|
||||
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
|
||||
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
|
||||
&mov (&DWP(4*4,"esp"),$a4); # a4
|
||||
&xor ($a4,$a2); # a2=a4^a2^a4
|
||||
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
|
||||
&xor ($a4,$a1); # a1^a2^a4
|
||||
&sar (@i[1],31); # broardcast 30th bit
|
||||
&and ($lo,$b);
|
||||
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
|
||||
&and (@i[1],$b);
|
||||
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
|
||||
&mov ($hi,$lo);
|
||||
&shl ($lo,31);
|
||||
&mov (@T[0],@i[1]);
|
||||
&shr ($hi,1);
|
||||
|
||||
&mov (@i[0],0x7);
|
||||
&shl (@i[1],30);
|
||||
&and (@i[0],$b);
|
||||
&shr (@T[0],2);
|
||||
&xor ($lo,@i[1]);
|
||||
|
||||
&shr ($b,3);
|
||||
&mov (@i[1],0x7); # 5-byte instruction!?
|
||||
&and (@i[1],$b);
|
||||
&shr ($b,3);
|
||||
&xor ($hi,@T[0]);
|
||||
&xor ($lo,&DWP(0,"esp",@i[0],4));
|
||||
&mov (@i[0],0x7);
|
||||
&and (@i[0],$b);
|
||||
&shr ($b,3);
|
||||
for($n=1;$n<9;$n++) {
|
||||
&mov (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@i[1],0x7);
|
||||
&mov (@T[0],@T[1]);
|
||||
&shl (@T[1],3*$n);
|
||||
&and (@i[1],$b);
|
||||
&shr (@T[0],32-3*$n);
|
||||
&xor ($lo,@T[1]);
|
||||
&shr ($b,3);
|
||||
&xor ($hi,@T[0]);
|
||||
|
||||
push(@i,shift(@i)); push(@T,shift(@T));
|
||||
}
|
||||
&mov (@T[1],&DWP(0,"esp",@i[1],4));
|
||||
&mov (@T[0],@T[1]);
|
||||
&shl (@T[1],3*$n);
|
||||
&mov (@i[1],&DWP(0,"esp",@i[0],4));
|
||||
&shr (@T[0],32-3*$n); $n++;
|
||||
&mov (@i[0],@i[1]);
|
||||
&xor ($lo,@T[1]);
|
||||
&shl (@i[1],3*$n);
|
||||
&xor ($hi,@T[0]);
|
||||
&shr (@i[0],32-3*$n);
|
||||
&xor ($lo,@i[1]);
|
||||
&xor ($hi,@i[0]);
|
||||
|
||||
&add ("esp",32+4);
|
||||
&ret ();
|
||||
&function_end_B("_mul_1x1_ialu");
|
||||
|
||||
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
|
||||
&function_begin_B("bn_GF2m_mul_2x2");
|
||||
if (!$x86only) {
|
||||
&picmeup("edx","OPENSSL_ia32cap_P");
|
||||
&mov ("eax",&DWP(0,"edx"));
|
||||
&mov ("edx",&DWP(4,"edx"));
|
||||
&test ("eax",1<<23); # check MMX bit
|
||||
&jz (&label("ialu"));
|
||||
if ($sse2) {
|
||||
&test ("eax",1<<24); # check FXSR bit
|
||||
&jz (&label("mmx"));
|
||||
&test ("edx",1<<1); # check PCLMULQDQ bit
|
||||
&jz (&label("mmx"));
|
||||
|
||||
&movups ("xmm0",&QWP(8,"esp"));
|
||||
&shufps ("xmm0","xmm0",0b10110001);
|
||||
&pclmulqdq ("xmm0","xmm0",1);
|
||||
&mov ("eax",&DWP(4,"esp"));
|
||||
&movups (&QWP(0,"eax"),"xmm0");
|
||||
&ret ();
|
||||
|
||||
&set_label("mmx",16);
|
||||
}
|
||||
&push ("ebp");
|
||||
&push ("ebx");
|
||||
&push ("esi");
|
||||
&push ("edi");
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&call ("_mul_1x1_mmx"); # a1·b1
|
||||
&movq ("mm7",$R);
|
||||
|
||||
&mov ($a,&wparam(2));
|
||||
&mov ($b,&wparam(4));
|
||||
&call ("_mul_1x1_mmx"); # a0·b0
|
||||
&movq ("mm6",$R);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&xor ($a,&wparam(2));
|
||||
&xor ($b,&wparam(4));
|
||||
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
|
||||
&pxor ($R,"mm7");
|
||||
&mov ($a,&wparam(0));
|
||||
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
|
||||
|
||||
&movq ($A,$R);
|
||||
&psllq ($R,32);
|
||||
&pop ("edi");
|
||||
&psrlq ($A,32);
|
||||
&pop ("esi");
|
||||
&pxor ($R,"mm6");
|
||||
&pop ("ebx");
|
||||
&pxor ($A,"mm7");
|
||||
&movq (&QWP(0,$a),$R);
|
||||
&pop ("ebp");
|
||||
&movq (&QWP(8,$a),$A);
|
||||
&emms ();
|
||||
&ret ();
|
||||
&set_label("ialu",16);
|
||||
}
|
||||
&push ("ebp");
|
||||
&push ("ebx");
|
||||
&push ("esi");
|
||||
&push ("edi");
|
||||
&stack_push(4+1);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&call ("_mul_1x1_ialu"); # a1·b1
|
||||
&mov (&DWP(8,"esp"),$lo);
|
||||
&mov (&DWP(12,"esp"),$hi);
|
||||
|
||||
&mov ($a,&wparam(2));
|
||||
&mov ($b,&wparam(4));
|
||||
&call ("_mul_1x1_ialu"); # a0·b0
|
||||
&mov (&DWP(0,"esp"),$lo);
|
||||
&mov (&DWP(4,"esp"),$hi);
|
||||
|
||||
&mov ($a,&wparam(1));
|
||||
&mov ($b,&wparam(3));
|
||||
&xor ($a,&wparam(2));
|
||||
&xor ($b,&wparam(4));
|
||||
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
|
||||
|
||||
&mov ("ebp",&wparam(0));
|
||||
@r=("ebx","ecx","edi","esi");
|
||||
&mov (@r[0],&DWP(0,"esp"));
|
||||
&mov (@r[1],&DWP(4,"esp"));
|
||||
&mov (@r[2],&DWP(8,"esp"));
|
||||
&mov (@r[3],&DWP(12,"esp"));
|
||||
|
||||
&xor ($lo,$hi);
|
||||
&xor ($hi,@r[1]);
|
||||
&xor ($lo,@r[0]);
|
||||
&mov (&DWP(0,"ebp"),@r[0]);
|
||||
&xor ($hi,@r[2]);
|
||||
&mov (&DWP(12,"ebp"),@r[3]);
|
||||
&xor ($lo,@r[3]);
|
||||
&stack_pop(4+1);
|
||||
&xor ($hi,@r[3]);
|
||||
&pop ("edi");
|
||||
&xor ($lo,$hi);
|
||||
&pop ("esi");
|
||||
&mov (&DWP(8,"ebp"),$hi);
|
||||
&pop ("ebx");
|
||||
&mov (&DWP(4,"ebp"),$lo);
|
||||
&pop ("ebp");
|
||||
&ret ();
|
||||
&function_end_B("bn_GF2m_mul_2x2");
|
||||
|
||||
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
593
openssl-1.0.2f/crypto/bn/asm/x86-mont.pl
Executable file
593
openssl-1.0.2f/crypto/bn/asm/x86-mont.pl
Executable file
@@ -0,0 +1,593 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# October 2005
|
||||
#
|
||||
# This is a "teaser" code, as it can be improved in several ways...
|
||||
# First of all non-SSE2 path should be implemented (yes, for now it
|
||||
# performs Montgomery multiplication/convolution only on SSE2-capable
|
||||
# CPUs such as P4, others fall down to original code). Then inner loop
|
||||
# can be unrolled and modulo-scheduled to improve ILP and possibly
|
||||
# moved to 128-bit XMM register bank (though it would require input
|
||||
# rearrangement and/or increase bus bandwidth utilization). Dedicated
|
||||
# squaring procedure should give further performance improvement...
|
||||
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
|
||||
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
|
||||
|
||||
# December 2006
|
||||
#
|
||||
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
|
||||
# Integer-only code [being equipped with dedicated squaring procedure]
|
||||
# gives ~40% on rsa512 sign benchmark...
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
&external_label("OPENSSL_ia32cap_P") if ($sse2);
|
||||
|
||||
&function_begin("bn_mul_mont");
|
||||
|
||||
$i="edx";
|
||||
$j="ecx";
|
||||
$ap="esi"; $tp="esi"; # overlapping variables!!!
|
||||
$rp="edi"; $bp="edi"; # overlapping variables!!!
|
||||
$np="ebp";
|
||||
$num="ebx";
|
||||
|
||||
$_num=&DWP(4*0,"esp"); # stack top layout
|
||||
$_rp=&DWP(4*1,"esp");
|
||||
$_ap=&DWP(4*2,"esp");
|
||||
$_bp=&DWP(4*3,"esp");
|
||||
$_np=&DWP(4*4,"esp");
|
||||
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
|
||||
$_sp=&DWP(4*6,"esp");
|
||||
$_bpend=&DWP(4*7,"esp");
|
||||
$frame=32; # size of above frame rounded up to 16n
|
||||
|
||||
&xor ("eax","eax");
|
||||
&mov ("edi",&wparam(5)); # int num
|
||||
&cmp ("edi",4);
|
||||
&jl (&label("just_leave"));
|
||||
|
||||
&lea ("esi",&wparam(0)); # put aside pointer to argument block
|
||||
&lea ("edx",&wparam(1)); # load ap
|
||||
&mov ("ebp","esp"); # saved stack pointer!
|
||||
&add ("edi",2); # extra two words on top of tp
|
||||
&neg ("edi");
|
||||
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
|
||||
&neg ("edi");
|
||||
|
||||
# minimize cache contention by arraning 2K window between stack
|
||||
# pointer and ap argument [np is also position sensitive vector,
|
||||
# but it's assumed to be near ap, as it's allocated at ~same
|
||||
# time].
|
||||
&mov ("eax","esp");
|
||||
&sub ("eax","edx");
|
||||
&and ("eax",2047);
|
||||
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
|
||||
|
||||
&xor ("edx","esp");
|
||||
&and ("edx",2048);
|
||||
&xor ("edx",2048);
|
||||
&sub ("esp","edx"); # this splits them apart modulo 4096
|
||||
|
||||
&and ("esp",-64); # align to cache line
|
||||
|
||||
################################# load argument block...
|
||||
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
|
||||
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
|
||||
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
|
||||
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
|
||||
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
|
||||
#&mov ("edi",&DWP(5*4,"esi"));# int num
|
||||
|
||||
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
|
||||
&mov ($_rp,"eax"); # ... save a copy of argument block
|
||||
&mov ($_ap,"ebx");
|
||||
&mov ($_bp,"ecx");
|
||||
&mov ($_np,"edx");
|
||||
&mov ($_n0,"esi");
|
||||
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
|
||||
#&mov ($_num,$num); # redundant as $num is not reused
|
||||
&mov ($_sp,"ebp"); # saved stack pointer!
|
||||
|
||||
if($sse2) {
|
||||
$acc0="mm0"; # mmx register bank layout
|
||||
$acc1="mm1";
|
||||
$car0="mm2";
|
||||
$car1="mm3";
|
||||
$mul0="mm4";
|
||||
$mul1="mm5";
|
||||
$temp="mm6";
|
||||
$mask="mm7";
|
||||
|
||||
&picmeup("eax","OPENSSL_ia32cap_P");
|
||||
&bt (&DWP(0,"eax"),26);
|
||||
&jnc (&label("non_sse2"));
|
||||
|
||||
&mov ("eax",-1);
|
||||
&movd ($mask,"eax"); # mask 32 lower bits
|
||||
|
||||
&mov ($ap,$_ap); # load input pointers
|
||||
&mov ($bp,$_bp);
|
||||
&mov ($np,$_np);
|
||||
|
||||
&xor ($i,$i); # i=0
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp)); # bp[0]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
|
||||
&movq ($car0,$mul1);
|
||||
&movq ($acc0,$mul1); # I wish movd worked for
|
||||
&pand ($acc0,$mask); # inter-register transfers
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&inc ($j); # j++
|
||||
&set_label("1st",16);
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&cmp ($j,$num);
|
||||
&jl (&label("1st"));
|
||||
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&paddq ($car1,$car0);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&inc ($i); # i++
|
||||
&set_label("outer");
|
||||
&xor ($j,$j); # j=0
|
||||
|
||||
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
|
||||
&movd ($mul1,&DWP(0,$ap)); # ap[0]
|
||||
&movd ($temp,&DWP($frame,"esp")); # tp[0]
|
||||
&movd ($car1,&DWP(0,$np)); # np[0]
|
||||
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
|
||||
|
||||
&paddq ($mul1,$temp); # +=tp[0]
|
||||
&movq ($acc0,$mul1);
|
||||
&movq ($car0,$mul1);
|
||||
&pand ($acc0,$mask);
|
||||
|
||||
&pmuludq($mul1,$_n0q); # *=n0
|
||||
|
||||
&pmuludq($car1,$mul1);
|
||||
&paddq ($car1,$acc0);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
|
||||
&movd ($acc1,&DWP(4,$np)); # np[1]
|
||||
&movd ($acc0,&DWP(4,$ap)); # ap[1]
|
||||
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[1]
|
||||
|
||||
&inc ($j); # j++
|
||||
&dec ($num);
|
||||
&set_label("inner");
|
||||
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[j]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
|
||||
&pand ($acc0,$mask);
|
||||
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
|
||||
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
|
||||
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
|
||||
&psrlq ($car0,32);
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
|
||||
&psrlq ($car1,32);
|
||||
&paddq ($car0,$temp); # +=tp[j+1]
|
||||
|
||||
&dec ($num);
|
||||
&lea ($j,&DWP(1,$j)); # j++
|
||||
&jnz (&label("inner"));
|
||||
|
||||
&mov ($num,$j);
|
||||
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
|
||||
&pmuludq($acc1,$mul1); # np[num-1]*m1
|
||||
&paddq ($car0,$acc0); # +=c0
|
||||
&paddq ($car1,$acc1); # +=c1
|
||||
|
||||
&movq ($acc0,$car0);
|
||||
&pand ($acc0,$mask);
|
||||
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
|
||||
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
|
||||
&psrlq ($car0,32);
|
||||
&psrlq ($car1,32);
|
||||
|
||||
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
|
||||
&paddq ($car1,$car0);
|
||||
&paddq ($car1,$temp);
|
||||
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
|
||||
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&cmp ($i,$num);
|
||||
&jle (&label("outer"));
|
||||
|
||||
&emms (); # done with mmx bank
|
||||
&jmp (&label("common_tail"));
|
||||
|
||||
&set_label("non_sse2",16);
|
||||
}
|
||||
|
||||
if (0) {
|
||||
&mov ("esp",$_sp);
|
||||
&xor ("eax","eax"); # signal "not fast enough [yet]"
|
||||
&jmp (&label("just_leave"));
|
||||
# While the below code provides competitive performance for
|
||||
# all key lengthes on modern Intel cores, it's still more
|
||||
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
|
||||
# means compared to the original integer-only assembler.
|
||||
# 512-bit RSA sign is better by ~40%, but that's about all
|
||||
# one can say about all CPUs...
|
||||
} else {
|
||||
$inp="esi"; # integer path uses these registers differently
|
||||
$word="edi";
|
||||
$carry="ebp";
|
||||
|
||||
&mov ($inp,$_ap);
|
||||
&lea ($carry,&DWP(1,$num));
|
||||
&mov ($word,$_bp);
|
||||
&xor ($j,$j); # j=0
|
||||
&mov ("edx",$inp);
|
||||
&and ($carry,1); # see if num is even
|
||||
&sub ("edx",$word); # see if ap==bp
|
||||
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
|
||||
&or ($carry,"edx");
|
||||
&mov ($word,&DWP(0,$word)); # bp[0]
|
||||
&jz (&label("bn_sqr_mont"));
|
||||
&mov ($_bpend,"eax");
|
||||
&mov ("eax",&DWP(0,$inp));
|
||||
&xor ("edx","edx");
|
||||
|
||||
&set_label("mull",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*bp[0]
|
||||
&add ($carry,"eax");
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("mull"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*bp[0]
|
||||
&mov ($word,$_n0);
|
||||
&add ("eax",$carry);
|
||||
&mov ($inp,$_np);
|
||||
&adc ("edx",0);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
|
||||
&xor ($j,$j);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
|
||||
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&adc ("edx",0);
|
||||
&inc ($j);
|
||||
|
||||
&jmp (&label("2ndmadd"));
|
||||
|
||||
&set_label("1stmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*bp[i]
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("1stmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*bp[i]
|
||||
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&mov ($word,$_n0);
|
||||
&adc ("edx",0);
|
||||
&mov ($inp,$_np);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&xor ($j,$j);
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
|
||||
&adc ($j,0);
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&adc ("edx",0);
|
||||
&mov ($j,1);
|
||||
|
||||
&set_label("2ndmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
|
||||
&jl (&label("2ndmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
|
||||
|
||||
&xor ("eax","eax");
|
||||
&mov ($j,$_bp); # &bp[i]
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
|
||||
&lea ($j,&DWP(4,$j));
|
||||
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
|
||||
&cmp ($j,$_bpend);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
|
||||
&je (&label("common_tail"));
|
||||
|
||||
&mov ($word,&DWP(0,$j)); # bp[i+1]
|
||||
&mov ($inp,$_ap);
|
||||
&mov ($_bp,$j); # &bp[++i]
|
||||
&xor ($j,$j);
|
||||
&xor ("edx","edx");
|
||||
&mov ("eax",&DWP(0,$inp));
|
||||
&jmp (&label("1stmadd"));
|
||||
|
||||
&set_label("bn_sqr_mont",16);
|
||||
$sbit=$num;
|
||||
&mov ($_num,$num);
|
||||
&mov ($_bp,$j); # i=0
|
||||
|
||||
&mov ("eax",$word); # ap[0]
|
||||
&mul ($word); # ap[0]*ap[0]
|
||||
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
|
||||
&mov ($sbit,"edx");
|
||||
&shr ("edx",1);
|
||||
&and ($sbit,1);
|
||||
&inc ($j);
|
||||
&set_label("sqr",16);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*ap[0]
|
||||
&add ("eax",$carry);
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("edx",0);
|
||||
&lea ($carry,&DWP(0,$sbit,"eax",2));
|
||||
&shr ("eax",31);
|
||||
&cmp ($j,$_num);
|
||||
&mov ($sbit,"eax");
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("sqr"));
|
||||
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[num-1]*ap[0]
|
||||
&add ("eax",$carry);
|
||||
&mov ($word,$_n0);
|
||||
&adc ("edx",0);
|
||||
&mov ($inp,$_np);
|
||||
&lea ($carry,&DWP(0,$sbit,"eax",2));
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
&shr ("eax",31);
|
||||
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
|
||||
|
||||
&lea ($carry,&DWP(0,"eax","edx",2));
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&shr ("edx",31);
|
||||
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
|
||||
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&mov ($num,$j);
|
||||
&adc ("edx",0);
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
&mov ($j,1);
|
||||
|
||||
&set_label("3rdmadd",16);
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j+1]*m
|
||||
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
|
||||
&lea ($j,&DWP(2,$j));
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
|
||||
&adc ("edx",0);
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
|
||||
&jl (&label("3rdmadd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # np[j]*m
|
||||
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
|
||||
&adc ("edx",0);
|
||||
&add ($carry,"eax");
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
|
||||
|
||||
&mov ($j,$_bp); # i
|
||||
&xor ("eax","eax");
|
||||
&mov ($inp,$_ap);
|
||||
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
|
||||
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
|
||||
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
|
||||
&cmp ($j,$num);
|
||||
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
|
||||
&je (&label("common_tail"));
|
||||
|
||||
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&mov ("eax",$word);
|
||||
&mov ($_bp,$j); # ++i
|
||||
&mul ($word); # ap[i]*ap[i]
|
||||
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
|
||||
&adc ("edx",0);
|
||||
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
|
||||
&xor ($carry,$carry);
|
||||
&cmp ($j,$num);
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&je (&label("sqrlast"));
|
||||
|
||||
&mov ($sbit,"edx"); # zaps $num
|
||||
&shr ("edx",1);
|
||||
&and ($sbit,1);
|
||||
&set_label("sqradd",16);
|
||||
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
|
||||
&mov ($carry,"edx");
|
||||
&mul ($word); # ap[j]*ap[i]
|
||||
&add ("eax",$carry);
|
||||
&lea ($carry,&DWP(0,"eax","eax"));
|
||||
&adc ("edx",0);
|
||||
&shr ("eax",31);
|
||||
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
|
||||
&lea ($j,&DWP(1,$j));
|
||||
&adc ("eax",0);
|
||||
&add ($carry,$sbit);
|
||||
&adc ("eax",0);
|
||||
&cmp ($j,$_num);
|
||||
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
|
||||
&mov ($sbit,"eax");
|
||||
&jle (&label("sqradd"));
|
||||
|
||||
&mov ($carry,"edx");
|
||||
&add ("edx","edx");
|
||||
&shr ($carry,31);
|
||||
&add ("edx",$sbit);
|
||||
&adc ($carry,0);
|
||||
&set_label("sqrlast");
|
||||
&mov ($word,$_n0);
|
||||
&mov ($inp,$_np);
|
||||
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
|
||||
|
||||
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
|
||||
&mov ("eax",&DWP(0,$inp)); # np[0]
|
||||
&adc ($carry,0);
|
||||
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
|
||||
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
|
||||
|
||||
&mul ($word); # np[0]*m
|
||||
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
|
||||
&lea ($num,&DWP(-1,$j));
|
||||
&adc ("edx",0);
|
||||
&mov ($j,1);
|
||||
&mov ("eax",&DWP(4,$inp)); # np[1]
|
||||
|
||||
&jmp (&label("3rdmadd"));
|
||||
}
|
||||
|
||||
&set_label("common_tail",16);
|
||||
&mov ($np,$_np); # load modulus pointer
|
||||
&mov ($rp,$_rp); # load result pointer
|
||||
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
|
||||
|
||||
&mov ("eax",&DWP(0,$tp)); # tp[0]
|
||||
&mov ($j,$num); # j=num-1
|
||||
&xor ($i,$i); # i=0 and clear CF!
|
||||
|
||||
&set_label("sub",16);
|
||||
&sbb ("eax",&DWP(0,$np,$i,4));
|
||||
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
|
||||
&dec ($j); # doesn't affect CF!
|
||||
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
|
||||
&lea ($i,&DWP(1,$i)); # i++
|
||||
&jge (&label("sub"));
|
||||
|
||||
&sbb ("eax",0); # handle upmost overflow bit
|
||||
&and ($tp,"eax");
|
||||
¬ ("eax");
|
||||
&mov ($np,$rp);
|
||||
&and ($np,"eax");
|
||||
&or ($tp,$np); # tp=carry?tp:rp
|
||||
|
||||
&set_label("copy",16); # copy or in-place refresh
|
||||
&mov ("eax",&DWP(0,$tp,$num,4));
|
||||
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
|
||||
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
|
||||
&dec ($num);
|
||||
&jge (&label("copy"));
|
||||
|
||||
&mov ("esp",$_sp); # pull saved stack pointer
|
||||
&mov ("eax",1);
|
||||
&set_label("just_leave");
|
||||
&function_end("bn_mul_mont");
|
||||
|
||||
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
|
||||
&asm_finish();
|
||||
28
openssl-1.0.2f/crypto/bn/asm/x86.pl
Normal file
28
openssl-1.0.2f/crypto/bn/asm/x86.pl
Normal file
@@ -0,0 +1,28 @@
|
||||
#!/usr/local/bin/perl
|
||||
|
||||
push(@INC,"perlasm","../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
require("x86/mul_add.pl");
|
||||
require("x86/mul.pl");
|
||||
require("x86/sqr.pl");
|
||||
require("x86/div.pl");
|
||||
require("x86/add.pl");
|
||||
require("x86/sub.pl");
|
||||
require("x86/comba.pl");
|
||||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
&bn_mul_add_words("bn_mul_add_words");
|
||||
&bn_mul_words("bn_mul_words");
|
||||
&bn_sqr_words("bn_sqr_words");
|
||||
&bn_div_words("bn_div_words");
|
||||
&bn_add_words("bn_add_words");
|
||||
&bn_sub_words("bn_sub_words");
|
||||
&bn_mul_comba("bn_mul_comba8",8);
|
||||
&bn_mul_comba("bn_mul_comba4",4);
|
||||
&bn_sqr_comba("bn_sqr_comba8",8);
|
||||
&bn_sqr_comba("bn_sqr_comba4",4);
|
||||
|
||||
&asm_finish();
|
||||
|
||||
76
openssl-1.0.2f/crypto/bn/asm/x86/add.pl
Normal file
76
openssl-1.0.2f/crypto/bn/asm/x86/add.pl
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_add_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
$c="eax";
|
||||
$r="ebx";
|
||||
$tmp1="ecx";
|
||||
$tmp2="edx";
|
||||
$num="ebp";
|
||||
|
||||
&mov($r,&wparam(0)); # get r
|
||||
&mov($a,&wparam(1)); # get a
|
||||
&mov($b,&wparam(2)); # get b
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&xor($c,$c); # clear carry
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
|
||||
&jz(&label("aw_finish"));
|
||||
|
||||
&set_label("aw_loop",0);
|
||||
for ($i=0; $i<8; $i++)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
|
||||
&add($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&add($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($b,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("aw_loop"));
|
||||
|
||||
&set_label("aw_finish",0);
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&and($num,7);
|
||||
&jz(&label("aw_end"));
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
|
||||
&add($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&add($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&dec($num) if ($i != 6);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
|
||||
&jz(&label("aw_end")) if ($i != 6);
|
||||
}
|
||||
&set_label("aw_end",0);
|
||||
|
||||
# &mov("eax",$c); # $c is "eax"
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
||||
277
openssl-1.0.2f/crypto/bn/asm/x86/comba.pl
Normal file
277
openssl-1.0.2f/crypto/bn/asm/x86/comba.pl
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub mul_add_c
|
||||
{
|
||||
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("mul a[$ai]*b[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$b,"",0));
|
||||
|
||||
&mul("edx");
|
||||
&add($c0,"eax");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
|
||||
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
|
||||
###
|
||||
&adc($c1,"edx");
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
|
||||
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
|
||||
}
|
||||
|
||||
sub sqr_add_c
|
||||
{
|
||||
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("sqr a[$ai]*a[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$b,"",0));
|
||||
|
||||
if ($ai == $bi)
|
||||
{ &mul("eax");}
|
||||
else
|
||||
{ &mul("edx");}
|
||||
&add($c0,"eax");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
|
||||
###
|
||||
&adc($c1,"edx");
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
|
||||
###
|
||||
&adc($c2,0);
|
||||
# is pos > 1, it means it is the last loop
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
}
|
||||
|
||||
sub sqr_add_c2
|
||||
{
|
||||
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
|
||||
|
||||
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
|
||||
# words, and 1 if load return value
|
||||
|
||||
&comment("sqr a[$ai]*a[$bi]");
|
||||
|
||||
# "eax" and "edx" will always be pre-loaded.
|
||||
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
|
||||
# &mov("edx",&DWP($bi*4,$a,"",0));
|
||||
|
||||
if ($ai == $bi)
|
||||
{ &mul("eax");}
|
||||
else
|
||||
{ &mul("edx");}
|
||||
&add("eax","eax");
|
||||
###
|
||||
&adc("edx","edx");
|
||||
###
|
||||
&adc($c2,0);
|
||||
&add($c0,"eax");
|
||||
&adc($c1,"edx");
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
|
||||
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
|
||||
&adc($c2,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
|
||||
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
|
||||
###
|
||||
}
|
||||
|
||||
sub bn_mul_comba
|
||||
{
|
||||
local($name,$num)=@_;
|
||||
local($a,$b,$c0,$c1,$c2);
|
||||
local($i,$as,$ae,$bs,$be,$ai,$bi);
|
||||
local($tot,$end);
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
$c0="ebx";
|
||||
$c1="ecx";
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
$be=0;
|
||||
$tot=$num+$num-1;
|
||||
|
||||
&push("esi");
|
||||
&mov($a,&wparam(1));
|
||||
&push("edi");
|
||||
&mov($b,&wparam(2));
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
|
||||
&xor($c0,$c0);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
&xor($c1,$c1);
|
||||
&mov("edx",&DWP(0,$b,"",0)); # load the first second
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
$ai=$as;
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("################## Calculate word $i");
|
||||
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
&xor($c2,$c2) if ($j == $bs);
|
||||
if (($j+1) == $end)
|
||||
{
|
||||
$v=1;
|
||||
$v=2 if (($i+1) == $tot);
|
||||
}
|
||||
else
|
||||
{ $v=0; }
|
||||
if (($j+1) != $end)
|
||||
{
|
||||
$na=($ai-1);
|
||||
$nb=($bi+1);
|
||||
}
|
||||
else
|
||||
{
|
||||
$na=$as+($i < ($num-1));
|
||||
$nb=$bs+($i >= ($num-1));
|
||||
}
|
||||
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
|
||||
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
if ($v)
|
||||
{
|
||||
&comment("saved r[$i]");
|
||||
# &mov("eax",&wparam(0));
|
||||
# &mov(&DWP($i*4,"eax","",0),$c0);
|
||||
($c0,$c1,$c2)=($c1,$c2,$c0);
|
||||
}
|
||||
$ai--;
|
||||
$bi++;
|
||||
}
|
||||
$as++ if ($i < ($num-1));
|
||||
$ae++ if ($i >= ($num-1));
|
||||
|
||||
$bs++ if ($i >= ($num-1));
|
||||
$be++ if ($i < ($num-1));
|
||||
}
|
||||
&comment("save r[$i]");
|
||||
# &mov("eax",&wparam(0));
|
||||
&mov(&DWP($i*4,"eax","",0),$c0);
|
||||
|
||||
&pop("ebx");
|
||||
&pop("ebp");
|
||||
&pop("edi");
|
||||
&pop("esi");
|
||||
&ret();
|
||||
&function_end_B($name);
|
||||
}
|
||||
|
||||
sub bn_sqr_comba
|
||||
{
|
||||
local($name,$num)=@_;
|
||||
local($r,$a,$c0,$c1,$c2)=@_;
|
||||
local($i,$as,$ae,$bs,$be,$ai,$bi);
|
||||
local($b,$tot,$end,$half);
|
||||
|
||||
&function_begin_B($name,"");
|
||||
|
||||
$c0="ebx";
|
||||
$c1="ecx";
|
||||
$c2="ebp";
|
||||
$a="esi";
|
||||
$r="edi";
|
||||
|
||||
&push("esi");
|
||||
&push("edi");
|
||||
&push("ebp");
|
||||
&push("ebx");
|
||||
&mov($r,&wparam(0));
|
||||
&mov($a,&wparam(1));
|
||||
&xor($c0,$c0);
|
||||
&xor($c1,$c1);
|
||||
&mov("eax",&DWP(0,$a,"",0)); # load the first word
|
||||
|
||||
$as=0;
|
||||
$ae=0;
|
||||
$bs=0;
|
||||
$be=0;
|
||||
$tot=$num+$num-1;
|
||||
|
||||
for ($i=0; $i<$tot; $i++)
|
||||
{
|
||||
$ai=$as;
|
||||
$bi=$bs;
|
||||
$end=$be+1;
|
||||
|
||||
&comment("############### Calculate word $i");
|
||||
for ($j=$bs; $j<$end; $j++)
|
||||
{
|
||||
&xor($c2,$c2) if ($j == $bs);
|
||||
if (($ai-1) < ($bi+1))
|
||||
{
|
||||
$v=1;
|
||||
$v=2 if ($i+1) == $tot;
|
||||
}
|
||||
else
|
||||
{ $v=0; }
|
||||
if (!$v)
|
||||
{
|
||||
$na=$ai-1;
|
||||
$nb=$bi+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
$na=$as+($i < ($num-1));
|
||||
$nb=$bs+($i >= ($num-1));
|
||||
}
|
||||
if ($ai == $bi)
|
||||
{
|
||||
&sqr_add_c($r,$a,$ai,$bi,
|
||||
$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
}
|
||||
else
|
||||
{
|
||||
&sqr_add_c2($r,$a,$ai,$bi,
|
||||
$c0,$c1,$c2,$v,$i,$na,$nb);
|
||||
}
|
||||
if ($v)
|
||||
{
|
||||
&comment("saved r[$i]");
|
||||
#&mov(&DWP($i*4,$r,"",0),$c0);
|
||||
($c0,$c1,$c2)=($c1,$c2,$c0);
|
||||
last;
|
||||
}
|
||||
$ai--;
|
||||
$bi++;
|
||||
}
|
||||
$as++ if ($i < ($num-1));
|
||||
$ae++ if ($i >= ($num-1));
|
||||
|
||||
$bs++ if ($i >= ($num-1));
|
||||
$be++ if ($i < ($num-1));
|
||||
}
|
||||
&mov(&DWP($i*4,$r,"",0),$c0);
|
||||
&pop("ebx");
|
||||
&pop("ebp");
|
||||
&pop("edi");
|
||||
&pop("esi");
|
||||
&ret();
|
||||
&function_end_B($name);
|
||||
}
|
||||
|
||||
1;
|
||||
15
openssl-1.0.2f/crypto/bn/asm/x86/div.pl
Normal file
15
openssl-1.0.2f/crypto/bn/asm/x86/div.pl
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_div_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
&mov("edx",&wparam(0)); #
|
||||
&mov("eax",&wparam(1)); #
|
||||
&mov("ebx",&wparam(2)); #
|
||||
&div("ebx");
|
||||
&function_end($name);
|
||||
}
|
||||
1;
|
||||
3
openssl-1.0.2f/crypto/bn/asm/x86/f
Normal file
3
openssl-1.0.2f/crypto/bn/asm/x86/f
Normal file
@@ -0,0 +1,3 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
77
openssl-1.0.2f/crypto/bn/asm/x86/mul.pl
Normal file
77
openssl-1.0.2f/crypto/bn/asm/x86/mul.pl
Normal file
@@ -0,0 +1,77 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_mul_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$Low="eax";
|
||||
$High="edx";
|
||||
$a="ebx";
|
||||
$w="ecx";
|
||||
$r="edi";
|
||||
$c="esi";
|
||||
$num="ebp";
|
||||
|
||||
&xor($c,$c); # clear carry
|
||||
&mov($r,&wparam(0)); #
|
||||
&mov($a,&wparam(1)); #
|
||||
&mov($num,&wparam(2)); #
|
||||
&mov($w,&wparam(3)); #
|
||||
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
&jz(&label("mw_finish"));
|
||||
|
||||
&set_label("mw_loop",0);
|
||||
for ($i=0; $i<32; $i+=4)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov("eax",&DWP($i,$a,"",0)); # *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+=c
|
||||
# XXX
|
||||
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
|
||||
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jz(&label("mw_finish"));
|
||||
&jmp(&label("mw_loop"));
|
||||
|
||||
&set_label("mw_finish",0);
|
||||
&mov($num,&wparam(2)); # get num
|
||||
&and($num,7);
|
||||
&jnz(&label("mw_finish2"));
|
||||
&jmp(&label("mw_end"));
|
||||
|
||||
&set_label("mw_finish2",1);
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov("eax",&DWP($i*4,$a,"",0));# *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+=c
|
||||
# XXX
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
&dec($num) if ($i != 7-1);
|
||||
&jz(&label("mw_end")) if ($i != 7-1);
|
||||
}
|
||||
&set_label("mw_end",0);
|
||||
&mov("eax",$c);
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
||||
87
openssl-1.0.2f/crypto/bn/asm/x86/mul_add.pl
Normal file
87
openssl-1.0.2f/crypto/bn/asm/x86/mul_add.pl
Normal file
@@ -0,0 +1,87 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_mul_add_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$Low="eax";
|
||||
$High="edx";
|
||||
$a="ebx";
|
||||
$w="ebp";
|
||||
$r="edi";
|
||||
$c="esi";
|
||||
|
||||
&xor($c,$c); # clear carry
|
||||
&mov($r,&wparam(0)); #
|
||||
|
||||
&mov("ecx",&wparam(2)); #
|
||||
&mov($a,&wparam(1)); #
|
||||
|
||||
&and("ecx",0xfffffff8); # num / 8
|
||||
&mov($w,&wparam(3)); #
|
||||
|
||||
&push("ecx"); # Up the stack for a tmp variable
|
||||
|
||||
&jz(&label("maw_finish"));
|
||||
|
||||
&set_label("maw_loop",0);
|
||||
|
||||
&mov(&swtmp(0),"ecx"); #
|
||||
|
||||
for ($i=0; $i<32; $i+=4)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov("eax",&DWP($i,$a,"",0)); # *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+= *r
|
||||
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&add("eax",$c); # L(t)+=c
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&mov("ecx",&swtmp(0)); #
|
||||
&add($a,32);
|
||||
&add($r,32);
|
||||
&sub("ecx",8);
|
||||
&jnz(&label("maw_loop"));
|
||||
|
||||
&set_label("maw_finish",0);
|
||||
&mov("ecx",&wparam(2)); # get num
|
||||
&and("ecx",7);
|
||||
&jnz(&label("maw_finish2")); # helps branch prediction
|
||||
&jmp(&label("maw_end"));
|
||||
|
||||
&set_label("maw_finish2",1);
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov("eax",&DWP($i*4,$a,"",0));# *a
|
||||
&mul($w); # *a * w
|
||||
&add("eax",$c); # L(t)+=c
|
||||
&mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&add("eax",$c);
|
||||
&adc("edx",0); # H(t)+=carry
|
||||
&dec("ecx") if ($i != 7-1);
|
||||
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
|
||||
&mov($c,"edx"); # c= H(t);
|
||||
&jz(&label("maw_end")) if ($i != 7-1);
|
||||
}
|
||||
&set_label("maw_end",0);
|
||||
&mov("eax",$c);
|
||||
|
||||
&pop("ecx"); # clear variable from
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
||||
60
openssl-1.0.2f/crypto/bn/asm/x86/sqr.pl
Normal file
60
openssl-1.0.2f/crypto/bn/asm/x86/sqr.pl
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_sqr_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$r="esi";
|
||||
$a="edi";
|
||||
$num="ebx";
|
||||
|
||||
&mov($r,&wparam(0)); #
|
||||
&mov($a,&wparam(1)); #
|
||||
&mov($num,&wparam(2)); #
|
||||
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
&jz(&label("sw_finish"));
|
||||
|
||||
&set_label("sw_loop",0);
|
||||
for ($i=0; $i<32; $i+=4)
|
||||
{
|
||||
&comment("Round $i");
|
||||
&mov("eax",&DWP($i,$a,"",0)); # *a
|
||||
# XXX
|
||||
&mul("eax"); # *a * *a
|
||||
&mov(&DWP($i*2,$r,"",0),"eax"); #
|
||||
&mov(&DWP($i*2+4,$r,"",0),"edx");#
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($r,64);
|
||||
&sub($num,8);
|
||||
&jnz(&label("sw_loop"));
|
||||
|
||||
&set_label("sw_finish",0);
|
||||
&mov($num,&wparam(2)); # get num
|
||||
&and($num,7);
|
||||
&jz(&label("sw_end"));
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov("eax",&DWP($i*4,$a,"",0)); # *a
|
||||
# XXX
|
||||
&mul("eax"); # *a * *a
|
||||
&mov(&DWP($i*8,$r,"",0),"eax"); #
|
||||
&dec($num) if ($i != 7-1);
|
||||
&mov(&DWP($i*8+4,$r,"",0),"edx");
|
||||
&jz(&label("sw_end")) if ($i != 7-1);
|
||||
}
|
||||
&set_label("sw_end",0);
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
||||
76
openssl-1.0.2f/crypto/bn/asm/x86/sub.pl
Normal file
76
openssl-1.0.2f/crypto/bn/asm/x86/sub.pl
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/local/bin/perl
|
||||
# x86 assember
|
||||
|
||||
sub bn_sub_words
|
||||
{
|
||||
local($name)=@_;
|
||||
|
||||
&function_begin($name,"");
|
||||
|
||||
&comment("");
|
||||
$a="esi";
|
||||
$b="edi";
|
||||
$c="eax";
|
||||
$r="ebx";
|
||||
$tmp1="ecx";
|
||||
$tmp2="edx";
|
||||
$num="ebp";
|
||||
|
||||
&mov($r,&wparam(0)); # get r
|
||||
&mov($a,&wparam(1)); # get a
|
||||
&mov($b,&wparam(2)); # get b
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&xor($c,$c); # clear carry
|
||||
&and($num,0xfffffff8); # num / 8
|
||||
|
||||
&jz(&label("aw_finish"));
|
||||
|
||||
&set_label("aw_loop",0);
|
||||
for ($i=0; $i<8; $i++)
|
||||
{
|
||||
&comment("Round $i");
|
||||
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
|
||||
&sub($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&sub($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
|
||||
}
|
||||
|
||||
&comment("");
|
||||
&add($a,32);
|
||||
&add($b,32);
|
||||
&add($r,32);
|
||||
&sub($num,8);
|
||||
&jnz(&label("aw_loop"));
|
||||
|
||||
&set_label("aw_finish",0);
|
||||
&mov($num,&wparam(3)); # get num
|
||||
&and($num,7);
|
||||
&jz(&label("aw_end"));
|
||||
|
||||
for ($i=0; $i<7; $i++)
|
||||
{
|
||||
&comment("Tail Round $i");
|
||||
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
|
||||
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
|
||||
&sub($tmp1,$c);
|
||||
&mov($c,0);
|
||||
&adc($c,$c);
|
||||
&sub($tmp1,$tmp2);
|
||||
&adc($c,0);
|
||||
&dec($num) if ($i != 6);
|
||||
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
|
||||
&jz(&label("aw_end")) if ($i != 6);
|
||||
}
|
||||
&set_label("aw_end",0);
|
||||
|
||||
# &mov("eax",$c); # $c is "eax"
|
||||
|
||||
&function_end($name);
|
||||
}
|
||||
|
||||
1;
|
||||
638
openssl-1.0.2f/crypto/bn/asm/x86_64-gcc.c
Normal file
638
openssl-1.0.2f/crypto/bn/asm/x86_64-gcc.c
Normal file
@@ -0,0 +1,638 @@
|
||||
#include "../bn_lcl.h"
|
||||
#if !(defined(__GNUC__) && __GNUC__>=2)
|
||||
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
|
||||
#else
|
||||
/*-
|
||||
* x86_64 BIGNUM accelerator version 0.1, December 2002.
|
||||
*
|
||||
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
* project.
|
||||
*
|
||||
* Rights for redistribution and usage in source and binary forms are
|
||||
* granted according to the OpenSSL license. Warranty of any kind is
|
||||
* disclaimed.
|
||||
*
|
||||
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
|
||||
* versions, like 1.0...
|
||||
* A. Well, that's because this code is basically a quick-n-dirty
|
||||
* proof-of-concept hack. As you can see it's implemented with
|
||||
* inline assembler, which means that you're bound to GCC and that
|
||||
* there might be enough room for further improvement.
|
||||
*
|
||||
* Q. Why inline assembler?
|
||||
* A. x86_64 features own ABI which I'm not familiar with. This is
|
||||
* why I decided to let the compiler take care of subroutine
|
||||
* prologue/epilogue as well as register allocation. For reference.
|
||||
* Win64 implements different ABI for AMD64, different from Linux.
|
||||
*
|
||||
* Q. How much faster does it get?
|
||||
* A. 'apps/openssl speed rsa dsa' output with no-asm:
|
||||
*
|
||||
* sign verify sign/s verify/s
|
||||
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
|
||||
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
|
||||
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
|
||||
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
|
||||
* sign verify sign/s verify/s
|
||||
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
|
||||
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
|
||||
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
|
||||
*
|
||||
* 'apps/openssl speed rsa dsa' output with this module:
|
||||
*
|
||||
* sign verify sign/s verify/s
|
||||
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
|
||||
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
|
||||
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
|
||||
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
|
||||
* sign verify sign/s verify/s
|
||||
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
|
||||
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
|
||||
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
|
||||
*
|
||||
* For the reference. IA-32 assembler implementation performs
|
||||
* very much like 64-bit code compiled with no-asm on the same
|
||||
* machine.
|
||||
*/
|
||||
|
||||
# if defined(_WIN64) || !defined(__LP64__)
|
||||
# define BN_ULONG unsigned long long
|
||||
# else
|
||||
# define BN_ULONG unsigned long
|
||||
# endif
|
||||
|
||||
# undef mul
|
||||
# undef mul_add
|
||||
|
||||
/*-
|
||||
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
|
||||
* "g"(0) let the compiler to decide where does it
|
||||
* want to keep the value of zero;
|
||||
*/
|
||||
# define mul_add(r,a,word,carry) do { \
|
||||
register BN_ULONG high,low; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(low),"=d"(high) \
|
||||
: "a"(word),"m"(a) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(carry),"+d"(high)\
|
||||
: "a"(low),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+m"(r),"+d"(high) \
|
||||
: "r"(carry),"g"(0) \
|
||||
: "cc"); \
|
||||
carry=high; \
|
||||
} while (0)
|
||||
|
||||
# define mul(r,a,word,carry) do { \
|
||||
register BN_ULONG high,low; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(low),"=d"(high) \
|
||||
: "a"(word),"g"(a) \
|
||||
: "cc"); \
|
||||
asm ("addq %2,%0; adcq %3,%1" \
|
||||
: "+r"(carry),"+d"(high)\
|
||||
: "a"(low),"g"(0) \
|
||||
: "cc"); \
|
||||
(r)=carry, carry=high; \
|
||||
} while (0)
|
||||
# undef sqr
|
||||
# define sqr(r0,r1,a) \
|
||||
asm ("mulq %2" \
|
||||
: "=a"(r0),"=d"(r1) \
|
||||
: "a"(a) \
|
||||
: "cc");
|
||||
|
||||
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
|
||||
BN_ULONG w)
|
||||
{
|
||||
BN_ULONG c1 = 0;
|
||||
|
||||
if (num <= 0)
|
||||
return (c1);
|
||||
|
||||
while (num & ~3) {
|
||||
mul_add(rp[0], ap[0], w, c1);
|
||||
mul_add(rp[1], ap[1], w, c1);
|
||||
mul_add(rp[2], ap[2], w, c1);
|
||||
mul_add(rp[3], ap[3], w, c1);
|
||||
ap += 4;
|
||||
rp += 4;
|
||||
num -= 4;
|
||||
}
|
||||
if (num) {
|
||||
mul_add(rp[0], ap[0], w, c1);
|
||||
if (--num == 0)
|
||||
return c1;
|
||||
mul_add(rp[1], ap[1], w, c1);
|
||||
if (--num == 0)
|
||||
return c1;
|
||||
mul_add(rp[2], ap[2], w, c1);
|
||||
return c1;
|
||||
}
|
||||
|
||||
return (c1);
|
||||
}
|
||||
|
||||
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
|
||||
{
|
||||
BN_ULONG c1 = 0;
|
||||
|
||||
if (num <= 0)
|
||||
return (c1);
|
||||
|
||||
while (num & ~3) {
|
||||
mul(rp[0], ap[0], w, c1);
|
||||
mul(rp[1], ap[1], w, c1);
|
||||
mul(rp[2], ap[2], w, c1);
|
||||
mul(rp[3], ap[3], w, c1);
|
||||
ap += 4;
|
||||
rp += 4;
|
||||
num -= 4;
|
||||
}
|
||||
if (num) {
|
||||
mul(rp[0], ap[0], w, c1);
|
||||
if (--num == 0)
|
||||
return c1;
|
||||
mul(rp[1], ap[1], w, c1);
|
||||
if (--num == 0)
|
||||
return c1;
|
||||
mul(rp[2], ap[2], w, c1);
|
||||
}
|
||||
return (c1);
|
||||
}
|
||||
|
||||
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
|
||||
{
|
||||
if (n <= 0)
|
||||
return;
|
||||
|
||||
while (n & ~3) {
|
||||
sqr(r[0], r[1], a[0]);
|
||||
sqr(r[2], r[3], a[1]);
|
||||
sqr(r[4], r[5], a[2]);
|
||||
sqr(r[6], r[7], a[3]);
|
||||
a += 4;
|
||||
r += 8;
|
||||
n -= 4;
|
||||
}
|
||||
if (n) {
|
||||
sqr(r[0], r[1], a[0]);
|
||||
if (--n == 0)
|
||||
return;
|
||||
sqr(r[2], r[3], a[1]);
|
||||
if (--n == 0)
|
||||
return;
|
||||
sqr(r[4], r[5], a[2]);
|
||||
}
|
||||
}
|
||||
|
||||
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
|
||||
{
|
||||
BN_ULONG ret, waste;
|
||||
|
||||
asm("divq %4":"=a"(ret), "=d"(waste)
|
||||
: "a"(l), "d"(h), "g"(d)
|
||||
: "cc");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
int n)
|
||||
{
|
||||
BN_ULONG ret;
|
||||
size_t i = 0;
|
||||
|
||||
if (n <= 0)
|
||||
return 0;
|
||||
|
||||
asm volatile (" subq %0,%0 \n" /* clear carry */
|
||||
" jmp 1f \n"
|
||||
".p2align 4 \n"
|
||||
"1: movq (%4,%2,8),%0 \n"
|
||||
" adcq (%5,%2,8),%0 \n"
|
||||
" movq %0,(%3,%2,8) \n"
|
||||
" lea 1(%2),%2 \n"
|
||||
" loop 1b \n"
|
||||
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
|
||||
"+r"(i)
|
||||
:"r"(rp), "r"(ap), "r"(bp)
|
||||
:"cc", "memory");
|
||||
|
||||
return ret & 1;
|
||||
}
|
||||
|
||||
# ifndef SIMICS
|
||||
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
int n)
|
||||
{
|
||||
BN_ULONG ret;
|
||||
size_t i = 0;
|
||||
|
||||
if (n <= 0)
|
||||
return 0;
|
||||
|
||||
asm volatile (" subq %0,%0 \n" /* clear borrow */
|
||||
" jmp 1f \n"
|
||||
".p2align 4 \n"
|
||||
"1: movq (%4,%2,8),%0 \n"
|
||||
" sbbq (%5,%2,8),%0 \n"
|
||||
" movq %0,(%3,%2,8) \n"
|
||||
" lea 1(%2),%2 \n"
|
||||
" loop 1b \n"
|
||||
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
|
||||
"+r"(i)
|
||||
:"r"(rp), "r"(ap), "r"(bp)
|
||||
:"cc", "memory");
|
||||
|
||||
return ret & 1;
|
||||
}
|
||||
# else
|
||||
/* Simics 1.4<7 has buggy sbbq:-( */
|
||||
# define BN_MASK2 0xffffffffffffffffL
|
||||
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
||||
{
|
||||
BN_ULONG t1, t2;
|
||||
int c = 0;
|
||||
|
||||
if (n <= 0)
|
||||
return ((BN_ULONG)0);
|
||||
|
||||
for (;;) {
|
||||
t1 = a[0];
|
||||
t2 = b[0];
|
||||
r[0] = (t1 - t2 - c) & BN_MASK2;
|
||||
if (t1 != t2)
|
||||
c = (t1 < t2);
|
||||
if (--n <= 0)
|
||||
break;
|
||||
|
||||
t1 = a[1];
|
||||
t2 = b[1];
|
||||
r[1] = (t1 - t2 - c) & BN_MASK2;
|
||||
if (t1 != t2)
|
||||
c = (t1 < t2);
|
||||
if (--n <= 0)
|
||||
break;
|
||||
|
||||
t1 = a[2];
|
||||
t2 = b[2];
|
||||
r[2] = (t1 - t2 - c) & BN_MASK2;
|
||||
if (t1 != t2)
|
||||
c = (t1 < t2);
|
||||
if (--n <= 0)
|
||||
break;
|
||||
|
||||
t1 = a[3];
|
||||
t2 = b[3];
|
||||
r[3] = (t1 - t2 - c) & BN_MASK2;
|
||||
if (t1 != t2)
|
||||
c = (t1 < t2);
|
||||
if (--n <= 0)
|
||||
break;
|
||||
|
||||
a += 4;
|
||||
b += 4;
|
||||
r += 4;
|
||||
}
|
||||
return (c);
|
||||
}
|
||||
# endif
|
||||
|
||||
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
|
||||
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
|
||||
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
|
||||
/*
|
||||
* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
|
||||
* c=(c2,c1,c0)
|
||||
*/
|
||||
|
||||
/*
|
||||
* Keep in mind that carrying into high part of multiplication result
|
||||
* can not overflow, because it cannot be all-ones.
|
||||
*/
|
||||
# if 0
|
||||
/* original macros are kept for reference purposes */
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo, hi, tt; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,tb); \
|
||||
c0 += lo; tt = hi+((c0<lo)?1:0); \
|
||||
c1 += tt; c2 += (c1<tt)?1:0; \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG ta = (a)[i]; \
|
||||
BN_ULONG lo, hi; \
|
||||
BN_UMULT_LOHI(lo,hi,ta,ta); \
|
||||
c0 += lo; hi += (c0<lo)?1:0; \
|
||||
c1 += hi; c2 += (c1<hi)?1:0; \
|
||||
} while(0)
|
||||
# else
|
||||
# define mul_add_c(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a),"m"(b) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
} while (0)
|
||||
|
||||
# define sqr_add_c(a,i,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %2" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a[i]) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
} while (0)
|
||||
|
||||
# define mul_add_c2(a,b,c0,c1,c2) do { \
|
||||
BN_ULONG t1,t2; \
|
||||
asm ("mulq %3" \
|
||||
: "=a"(t1),"=d"(t2) \
|
||||
: "a"(a),"m"(b) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
|
||||
: "+r"(c0),"+r"(c1),"+r"(c2) \
|
||||
: "r"(t1),"r"(t2),"g"(0) \
|
||||
: "cc"); \
|
||||
} while (0)
|
||||
# endif
|
||||
|
||||
# define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
c2 = 0;
|
||||
c3 = 0;
|
||||
mul_add_c(a[0], b[0], c1, c2, c3);
|
||||
r[0] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[0], b[1], c2, c3, c1);
|
||||
mul_add_c(a[1], b[0], c2, c3, c1);
|
||||
r[1] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[2], b[0], c3, c1, c2);
|
||||
mul_add_c(a[1], b[1], c3, c1, c2);
|
||||
mul_add_c(a[0], b[2], c3, c1, c2);
|
||||
r[2] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[0], b[3], c1, c2, c3);
|
||||
mul_add_c(a[1], b[2], c1, c2, c3);
|
||||
mul_add_c(a[2], b[1], c1, c2, c3);
|
||||
mul_add_c(a[3], b[0], c1, c2, c3);
|
||||
r[3] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[4], b[0], c2, c3, c1);
|
||||
mul_add_c(a[3], b[1], c2, c3, c1);
|
||||
mul_add_c(a[2], b[2], c2, c3, c1);
|
||||
mul_add_c(a[1], b[3], c2, c3, c1);
|
||||
mul_add_c(a[0], b[4], c2, c3, c1);
|
||||
r[4] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[0], b[5], c3, c1, c2);
|
||||
mul_add_c(a[1], b[4], c3, c1, c2);
|
||||
mul_add_c(a[2], b[3], c3, c1, c2);
|
||||
mul_add_c(a[3], b[2], c3, c1, c2);
|
||||
mul_add_c(a[4], b[1], c3, c1, c2);
|
||||
mul_add_c(a[5], b[0], c3, c1, c2);
|
||||
r[5] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[6], b[0], c1, c2, c3);
|
||||
mul_add_c(a[5], b[1], c1, c2, c3);
|
||||
mul_add_c(a[4], b[2], c1, c2, c3);
|
||||
mul_add_c(a[3], b[3], c1, c2, c3);
|
||||
mul_add_c(a[2], b[4], c1, c2, c3);
|
||||
mul_add_c(a[1], b[5], c1, c2, c3);
|
||||
mul_add_c(a[0], b[6], c1, c2, c3);
|
||||
r[6] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[0], b[7], c2, c3, c1);
|
||||
mul_add_c(a[1], b[6], c2, c3, c1);
|
||||
mul_add_c(a[2], b[5], c2, c3, c1);
|
||||
mul_add_c(a[3], b[4], c2, c3, c1);
|
||||
mul_add_c(a[4], b[3], c2, c3, c1);
|
||||
mul_add_c(a[5], b[2], c2, c3, c1);
|
||||
mul_add_c(a[6], b[1], c2, c3, c1);
|
||||
mul_add_c(a[7], b[0], c2, c3, c1);
|
||||
r[7] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[7], b[1], c3, c1, c2);
|
||||
mul_add_c(a[6], b[2], c3, c1, c2);
|
||||
mul_add_c(a[5], b[3], c3, c1, c2);
|
||||
mul_add_c(a[4], b[4], c3, c1, c2);
|
||||
mul_add_c(a[3], b[5], c3, c1, c2);
|
||||
mul_add_c(a[2], b[6], c3, c1, c2);
|
||||
mul_add_c(a[1], b[7], c3, c1, c2);
|
||||
r[8] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[2], b[7], c1, c2, c3);
|
||||
mul_add_c(a[3], b[6], c1, c2, c3);
|
||||
mul_add_c(a[4], b[5], c1, c2, c3);
|
||||
mul_add_c(a[5], b[4], c1, c2, c3);
|
||||
mul_add_c(a[6], b[3], c1, c2, c3);
|
||||
mul_add_c(a[7], b[2], c1, c2, c3);
|
||||
r[9] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[7], b[3], c2, c3, c1);
|
||||
mul_add_c(a[6], b[4], c2, c3, c1);
|
||||
mul_add_c(a[5], b[5], c2, c3, c1);
|
||||
mul_add_c(a[4], b[6], c2, c3, c1);
|
||||
mul_add_c(a[3], b[7], c2, c3, c1);
|
||||
r[10] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[4], b[7], c3, c1, c2);
|
||||
mul_add_c(a[5], b[6], c3, c1, c2);
|
||||
mul_add_c(a[6], b[5], c3, c1, c2);
|
||||
mul_add_c(a[7], b[4], c3, c1, c2);
|
||||
r[11] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[7], b[5], c1, c2, c3);
|
||||
mul_add_c(a[6], b[6], c1, c2, c3);
|
||||
mul_add_c(a[5], b[7], c1, c2, c3);
|
||||
r[12] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[6], b[7], c2, c3, c1);
|
||||
mul_add_c(a[7], b[6], c2, c3, c1);
|
||||
r[13] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[7], b[7], c3, c1, c2);
|
||||
r[14] = c3;
|
||||
r[15] = c1;
|
||||
}
|
||||
|
||||
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
|
||||
{
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
c2 = 0;
|
||||
c3 = 0;
|
||||
mul_add_c(a[0], b[0], c1, c2, c3);
|
||||
r[0] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[0], b[1], c2, c3, c1);
|
||||
mul_add_c(a[1], b[0], c2, c3, c1);
|
||||
r[1] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[2], b[0], c3, c1, c2);
|
||||
mul_add_c(a[1], b[1], c3, c1, c2);
|
||||
mul_add_c(a[0], b[2], c3, c1, c2);
|
||||
r[2] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[0], b[3], c1, c2, c3);
|
||||
mul_add_c(a[1], b[2], c1, c2, c3);
|
||||
mul_add_c(a[2], b[1], c1, c2, c3);
|
||||
mul_add_c(a[3], b[0], c1, c2, c3);
|
||||
r[3] = c1;
|
||||
c1 = 0;
|
||||
mul_add_c(a[3], b[1], c2, c3, c1);
|
||||
mul_add_c(a[2], b[2], c2, c3, c1);
|
||||
mul_add_c(a[1], b[3], c2, c3, c1);
|
||||
r[4] = c2;
|
||||
c2 = 0;
|
||||
mul_add_c(a[2], b[3], c3, c1, c2);
|
||||
mul_add_c(a[3], b[2], c3, c1, c2);
|
||||
r[5] = c3;
|
||||
c3 = 0;
|
||||
mul_add_c(a[3], b[3], c1, c2, c3);
|
||||
r[6] = c1;
|
||||
r[7] = c2;
|
||||
}
|
||||
|
||||
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
c2 = 0;
|
||||
c3 = 0;
|
||||
sqr_add_c(a, 0, c1, c2, c3);
|
||||
r[0] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c2(a, 1, 0, c2, c3, c1);
|
||||
r[1] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c(a, 1, c3, c1, c2);
|
||||
sqr_add_c2(a, 2, 0, c3, c1, c2);
|
||||
r[2] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c2(a, 3, 0, c1, c2, c3);
|
||||
sqr_add_c2(a, 2, 1, c1, c2, c3);
|
||||
r[3] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c(a, 2, c2, c3, c1);
|
||||
sqr_add_c2(a, 3, 1, c2, c3, c1);
|
||||
sqr_add_c2(a, 4, 0, c2, c3, c1);
|
||||
r[4] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c2(a, 5, 0, c3, c1, c2);
|
||||
sqr_add_c2(a, 4, 1, c3, c1, c2);
|
||||
sqr_add_c2(a, 3, 2, c3, c1, c2);
|
||||
r[5] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c(a, 3, c1, c2, c3);
|
||||
sqr_add_c2(a, 4, 2, c1, c2, c3);
|
||||
sqr_add_c2(a, 5, 1, c1, c2, c3);
|
||||
sqr_add_c2(a, 6, 0, c1, c2, c3);
|
||||
r[6] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c2(a, 7, 0, c2, c3, c1);
|
||||
sqr_add_c2(a, 6, 1, c2, c3, c1);
|
||||
sqr_add_c2(a, 5, 2, c2, c3, c1);
|
||||
sqr_add_c2(a, 4, 3, c2, c3, c1);
|
||||
r[7] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c(a, 4, c3, c1, c2);
|
||||
sqr_add_c2(a, 5, 3, c3, c1, c2);
|
||||
sqr_add_c2(a, 6, 2, c3, c1, c2);
|
||||
sqr_add_c2(a, 7, 1, c3, c1, c2);
|
||||
r[8] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c2(a, 7, 2, c1, c2, c3);
|
||||
sqr_add_c2(a, 6, 3, c1, c2, c3);
|
||||
sqr_add_c2(a, 5, 4, c1, c2, c3);
|
||||
r[9] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c(a, 5, c2, c3, c1);
|
||||
sqr_add_c2(a, 6, 4, c2, c3, c1);
|
||||
sqr_add_c2(a, 7, 3, c2, c3, c1);
|
||||
r[10] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c2(a, 7, 4, c3, c1, c2);
|
||||
sqr_add_c2(a, 6, 5, c3, c1, c2);
|
||||
r[11] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c(a, 6, c1, c2, c3);
|
||||
sqr_add_c2(a, 7, 5, c1, c2, c3);
|
||||
r[12] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c2(a, 7, 6, c2, c3, c1);
|
||||
r[13] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c(a, 7, c3, c1, c2);
|
||||
r[14] = c3;
|
||||
r[15] = c1;
|
||||
}
|
||||
|
||||
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
|
||||
{
|
||||
BN_ULONG c1, c2, c3;
|
||||
|
||||
c1 = 0;
|
||||
c2 = 0;
|
||||
c3 = 0;
|
||||
sqr_add_c(a, 0, c1, c2, c3);
|
||||
r[0] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c2(a, 1, 0, c2, c3, c1);
|
||||
r[1] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c(a, 1, c3, c1, c2);
|
||||
sqr_add_c2(a, 2, 0, c3, c1, c2);
|
||||
r[2] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c2(a, 3, 0, c1, c2, c3);
|
||||
sqr_add_c2(a, 2, 1, c1, c2, c3);
|
||||
r[3] = c1;
|
||||
c1 = 0;
|
||||
sqr_add_c(a, 2, c2, c3, c1);
|
||||
sqr_add_c2(a, 3, 1, c2, c3, c1);
|
||||
r[4] = c2;
|
||||
c2 = 0;
|
||||
sqr_add_c2(a, 3, 2, c3, c1, c2);
|
||||
r[5] = c3;
|
||||
c3 = 0;
|
||||
sqr_add_c(a, 3, c1, c2, c3);
|
||||
r[6] = c1;
|
||||
r[7] = c2;
|
||||
}
|
||||
#endif
|
||||
390
openssl-1.0.2f/crypto/bn/asm/x86_64-gf2m.pl
Normal file
390
openssl-1.0.2f/crypto/bn/asm/x86_64-gf2m.pl
Normal file
@@ -0,0 +1,390 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# May 2011
|
||||
#
|
||||
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
|
||||
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
|
||||
# the time being... Except that it has two code paths: code suitable
|
||||
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
|
||||
# later. Improvement varies from one benchmark and µ-arch to another.
|
||||
# Vanilla code path is at most 20% faster than compiler-generated code
|
||||
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
|
||||
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
|
||||
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
|
||||
# all CPU time is burnt in it...
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
|
||||
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
||||
die "can't locate x86_64-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
($lo,$hi)=("%rax","%rdx"); $a=$lo;
|
||||
($i0,$i1)=("%rsi","%rdi");
|
||||
($t0,$t1)=("%rbx","%rcx");
|
||||
($b,$mask)=("%rbp","%r8");
|
||||
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
|
||||
($R,$Tx)=("%xmm0","%xmm1");
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.type _mul_1x1,\@abi-omnipotent
|
||||
.align 16
|
||||
_mul_1x1:
|
||||
sub \$128+8,%rsp
|
||||
mov \$-1,$a1
|
||||
lea ($a,$a),$i0
|
||||
shr \$3,$a1
|
||||
lea (,$a,4),$i1
|
||||
and $a,$a1 # a1=a&0x1fffffffffffffff
|
||||
lea (,$a,8),$a8
|
||||
sar \$63,$a # broadcast 63rd bit
|
||||
lea ($a1,$a1),$a2
|
||||
sar \$63,$i0 # broadcast 62nd bit
|
||||
lea (,$a1,4),$a4
|
||||
and $b,$a
|
||||
sar \$63,$i1 # boardcast 61st bit
|
||||
mov $a,$hi # $a is $lo
|
||||
shl \$63,$lo
|
||||
and $b,$i0
|
||||
shr \$1,$hi
|
||||
mov $i0,$t1
|
||||
shl \$62,$i0
|
||||
and $b,$i1
|
||||
shr \$2,$t1
|
||||
xor $i0,$lo
|
||||
mov $i1,$t0
|
||||
shl \$61,$i1
|
||||
xor $t1,$hi
|
||||
shr \$3,$t0
|
||||
xor $i1,$lo
|
||||
xor $t0,$hi
|
||||
|
||||
mov $a1,$a12
|
||||
movq \$0,0(%rsp) # tab[0]=0
|
||||
xor $a2,$a12 # a1^a2
|
||||
mov $a1,8(%rsp) # tab[1]=a1
|
||||
mov $a4,$a48
|
||||
mov $a2,16(%rsp) # tab[2]=a2
|
||||
xor $a8,$a48 # a4^a8
|
||||
mov $a12,24(%rsp) # tab[3]=a1^a2
|
||||
|
||||
xor $a4,$a1
|
||||
mov $a4,32(%rsp) # tab[4]=a4
|
||||
xor $a4,$a2
|
||||
mov $a1,40(%rsp) # tab[5]=a1^a4
|
||||
xor $a4,$a12
|
||||
mov $a2,48(%rsp) # tab[6]=a2^a4
|
||||
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
|
||||
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
|
||||
xor $a48,$a2 # a2^a4^a4^a8=a1^a8
|
||||
|
||||
mov $a8,64(%rsp) # tab[8]=a8
|
||||
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
|
||||
mov $a1,72(%rsp) # tab[9]=a1^a8
|
||||
xor $a4,$a1 # a1^a8^a4
|
||||
mov $a2,80(%rsp) # tab[10]=a2^a8
|
||||
xor $a4,$a2 # a2^a8^a4
|
||||
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
|
||||
|
||||
xor $a4,$a12 # a1^a2^a8^a4
|
||||
mov $a48,96(%rsp) # tab[12]=a4^a8
|
||||
mov $mask,$i0
|
||||
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
|
||||
and $b,$i0
|
||||
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
|
||||
shr \$4,$b
|
||||
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
|
||||
mov $mask,$i1
|
||||
and $b,$i1
|
||||
shr \$4,$b
|
||||
|
||||
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
|
||||
mov $mask,$i0
|
||||
and $b,$i0
|
||||
shr \$4,$b
|
||||
___
|
||||
for ($n=1;$n<8;$n++) {
|
||||
$code.=<<___;
|
||||
mov (%rsp,$i1,8),$t1
|
||||
mov $mask,$i1
|
||||
mov $t1,$t0
|
||||
shl \$`8*$n-4`,$t1
|
||||
and $b,$i1
|
||||
movq (%rsp,$i0,8),$Tx
|
||||
shr \$`64-(8*$n-4)`,$t0
|
||||
xor $t1,$lo
|
||||
pslldq \$$n,$Tx
|
||||
mov $mask,$i0
|
||||
shr \$4,$b
|
||||
xor $t0,$hi
|
||||
and $b,$i0
|
||||
shr \$4,$b
|
||||
pxor $Tx,$R
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
mov (%rsp,$i1,8),$t1
|
||||
mov $t1,$t0
|
||||
shl \$`8*$n-4`,$t1
|
||||
movq $R,$i0
|
||||
shr \$`64-(8*$n-4)`,$t0
|
||||
xor $t1,$lo
|
||||
psrldq \$8,$R
|
||||
xor $t0,$hi
|
||||
movq $R,$i1
|
||||
xor $i0,$lo
|
||||
xor $i1,$hi
|
||||
|
||||
add \$128+8,%rsp
|
||||
ret
|
||||
.Lend_mul_1x1:
|
||||
.size _mul_1x1,.-_mul_1x1
|
||||
___
|
||||
|
||||
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
|
||||
|
||||
# Emit bn_GF2m_mul_2x2(r[4], a1, a0, b1, b0): full 2x2-word carry-less
# (GF(2)[x]) multiplication, producing a 4-word result at $rp.
# Bit 33 of OPENSSL_ia32cap_P gates the PCLMULQDQ fast path; without it
# we fall through to .Lvanilla_mul_2x2, which Karatsuba-combines three
# _mul_1x1 calls (defined earlier in this file).
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
	mov	OPENSSL_ia32cap_P(%rip),%rax
	bt	\$33,%rax			# PCLMULQDQ capability bit
	jnc	.Lvanilla_mul_2x2

	movq	$a1,%xmm0
	movq	$b1,%xmm1
	movq	$a0,%xmm2
___
# On Win64 only four arguments arrive in registers; the 5th ($b0) is on
# the stack at 40(%rsp) (32-byte shadow space + return address).
$code.=<<___ if ($win64);
	movq	40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
	movq	$b0,%xmm3
___
# Karatsuba with PCLMULQDQ: three 64x64->128 carry-less products, then
# the middle term is folded into the low/high halves with XORs.
$code.=<<___;
	movdqa	%xmm0,%xmm4
	movdqa	%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
	pxor	%xmm2,%xmm4		# a0+a1
	pxor	%xmm3,%xmm5		# b0+b1
	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
	xorps	%xmm0,%xmm4
	xorps	%xmm2,%xmm4		# (a0+a1)·(b0+b1)-a0·b0-a1·b1
	movdqa	%xmm4,%xmm5
	pslldq	\$8,%xmm4
	psrldq	\$8,%xmm5
	pxor	%xmm4,%xmm2		# low 128 bits of result
	pxor	%xmm5,%xmm0		# high 128 bits of result
	movdqu	%xmm2,0($rp)
	movdqu	%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp	# scratch + callee-saved register area
___
# Win64: fetch the stacked 5th argument (offset adjusted for the frame
# just allocated) and preserve rdi/rsi, which are callee-saved there.
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
$code.=<<___;
	mov	%r14,8*10(%rsp)
	mov	%r13,8*11(%rsp)
	mov	%r12,8*12(%rsp)
	mov	%rbp,8*13(%rsp)
	mov	%rbx,8*14(%rsp)
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1·b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0·b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a		# a0+a1
	xor	64(%rsp),$b		# b0+b1
	call	_mul_1x1		# (a0+a1)·(b0+b1)
___
	# Registers receiving the two saved partial products; chosen from
	# what _mul_1x1 does not return in ($lo,$hi).
	@r=("%rbx","%rcx","%rdi","%rsi");
# Karatsuba recombination in GF(2): the middle term ($lo,$hi) is XORed
# into the low product (@r[0],@r[1]) and high product (@r[2],@r[3]),
# then the four result words land at 0/8/16/24(%rbp) (%rbp = $rp).
$code.=<<___;
	mov	0(%rsp),@r[0]
	mov	8(%rsp),@r[1]
	mov	16(%rsp),@r[2]
	mov	24(%rsp),@r[3]
	mov	32(%rsp),%rbp		# reload $rp

	xor	$hi,$lo
	xor	@r[1],$hi
	xor	@r[0],$lo
	mov	@r[0],0(%rbp)
	xor	@r[2],$hi
	mov	@r[3],24(%rbp)
	xor	@r[3],$lo
	xor	@r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14		# restore callee-saved registers
	mov	8*11(%rsp),%r13
	mov	8*12(%rsp),%r12
	mov	8*13(%rsp),%rbp
	mov	8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
$code.=<<___;
	lea	8*17(%rsp),%rsp		# release frame
	ret
.Lend_mul_2x2:
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
|
||||
|
||||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||||
if ($win64) {
|
||||
$rec="%rcx";
|
||||
$frame="%rdx";
|
||||
$context="%r8";
|
||||
$disp="%r9";
|
||||
|
||||
$code.=<<___;
|
||||
.extern __imp_RtlVirtualUnwind
|
||||
|
||||
.type se_handler,\@abi-omnipotent
|
||||
.align 16
|
||||
se_handler:
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
pushfq
|
||||
sub \$64,%rsp
|
||||
|
||||
mov 152($context),%rax # pull context->Rsp
|
||||
mov 248($context),%rbx # pull context->Rip
|
||||
|
||||
lea .Lbody_mul_2x2(%rip),%r10
|
||||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||||
jb .Lin_prologue
|
||||
|
||||
mov 8*10(%rax),%r14 # mimic epilogue
|
||||
mov 8*11(%rax),%r13
|
||||
mov 8*12(%rax),%r12
|
||||
mov 8*13(%rax),%rbp
|
||||
mov 8*14(%rax),%rbx
|
||||
mov 8*15(%rax),%rdi
|
||||
mov 8*16(%rax),%rsi
|
||||
|
||||
mov %rbx,144($context) # restore context->Rbx
|
||||
mov %rbp,160($context) # restore context->Rbp
|
||||
mov %rsi,168($context) # restore context->Rsi
|
||||
mov %rdi,176($context) # restore context->Rdi
|
||||
mov %r12,216($context) # restore context->R12
|
||||
mov %r13,224($context) # restore context->R13
|
||||
mov %r14,232($context) # restore context->R14
|
||||
|
||||
.Lin_prologue:
|
||||
lea 8*17(%rax),%rax
|
||||
mov %rax,152($context) # restore context->Rsp
|
||||
|
||||
mov 40($disp),%rdi # disp->ContextRecord
|
||||
mov $context,%rsi # context
|
||||
mov \$154,%ecx # sizeof(CONTEXT)
|
||||
.long 0xa548f3fc # cld; rep movsq
|
||||
|
||||
mov $disp,%rsi
|
||||
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
||||
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
||||
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
||||
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
||||
mov 40(%rsi),%r10 # disp->ContextRecord
|
||||
lea 56(%rsi),%r11 # &disp->HandlerData
|
||||
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
||||
mov %r10,32(%rsp) # arg5
|
||||
mov %r11,40(%rsp) # arg6
|
||||
mov %r12,48(%rsp) # arg7
|
||||
mov %rcx,56(%rsp) # arg8, (NULL)
|
||||
call *__imp_RtlVirtualUnwind(%rip)
|
||||
|
||||
mov \$1,%eax # ExceptionContinueSearch
|
||||
add \$64,%rsp
|
||||
popfq
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
ret
|
||||
.size se_handler,.-se_handler
|
||||
|
||||
.section .pdata
|
||||
.align 4
|
||||
.rva _mul_1x1
|
||||
.rva .Lend_mul_1x1
|
||||
.rva .LSEH_info_1x1
|
||||
|
||||
.rva .Lvanilla_mul_2x2
|
||||
.rva .Lend_mul_2x2
|
||||
.rva .LSEH_info_2x2
|
||||
.section .xdata
|
||||
.align 8
|
||||
.LSEH_info_1x1:
|
||||
.byte 0x01,0x07,0x02,0x00
|
||||
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
|
||||
.LSEH_info_2x2:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
___
|
||||
}
|
||||
|
||||
# Post-process the accumulated assembly: every `...` span in $code is a
# compile-time expression (e.g. `8*17+40`) that is folded to its numeric
# value via eval, then the finished text is written to stdout.
my $asm = $code;
$asm =~ s/\`([^\`]*)\`/eval($1)/gem;
print $asm;
close STDOUT;
|
||||
1407
openssl-1.0.2f/crypto/bn/asm/x86_64-mont.pl
Executable file
1407
openssl-1.0.2f/crypto/bn/asm/x86_64-mont.pl
Executable file
File diff suppressed because it is too large
Load Diff
3540
openssl-1.0.2f/crypto/bn/asm/x86_64-mont5.pl
Executable file
3540
openssl-1.0.2f/crypto/bn/asm/x86_64-mont5.pl
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user