Initial Commit

root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

@@ -0,0 +1,27 @@
<OBSOLETE>
All assembler files in this directory are just versions of the file
crypto/bn/bn_asm.c.
Quite a few of these files are simply the assembler output from gcc, since on
quite a few machines that output is 2 times faster than the system compiler's.
For the x86, I have hand-written the assembler because of the bad job all
compilers seem to do on it. This normally gives a 2 times speed-up in the RSA
routines.
For the DEC Alpha, I also hand-wrote the assembler (except the division, which
is just the output from the C compiler pasted on the end of the file).
On the 2 Alpha C compilers I had access to, it was not possible to do
64b x 64b -> 128b calculations (both the long and the long long data types
were 64 bits). So the hand assembler gives access to the 128-bit result and
a 2 times speedup :-).
There are 3 versions of assembler for the HP PA-RISC.
pa-risc.s is the original one, which works fine and was generated using gcc :-)
pa-risc2W.s and pa-risc2.s are 64- and 32-bit PA-RISC 2.0 implementations
by Chris Ruemmler from HP (with some help from the HP C compiler).
</OBSOLETE>
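
As a side note to the paragraph above: the 64b x 64b -> 128b primitive that the hand-written Alpha assembler provides (mulq for the low half, umulh for the high half) can only be sketched in C with a compiler extension. A minimal illustration, assuming GCC/Clang's unsigned __int128 (which the compilers discussed above did not offer); the helper name mul64 is made up for the example:

/*
 * Sketch of the 64x64 -> 128-bit multiply the Alpha assembler gets from
 * the mulq/umulh instruction pair.  unsigned __int128 is a GCC/Clang
 * extension, not something the original C compilers provided.
 */
typedef unsigned long long u64;

static void mul64(u64 a, u64 b, u64 *hi, u64 *lo)
{
    unsigned __int128 t = (unsigned __int128)a * b; /* full 128-bit product */

    *lo = (u64)t;           /* low 64 bits:  mulq on Alpha  */
    *hi = (u64)(t >> 64);   /* high 64 bits: umulh on Alpha */
}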

@@ -0,0 +1,321 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264, RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096-bit key lengths. This is measured against code from
# the vendor compiler invoked with '-tune host' and using in-line
# assembler. Other benchmarks improve by 15-20%. To anchor it to
# something else, the code provides approximately the same performance
# per GHz as AMD64; i.e. if you compare a 1GHz 21264 and a 2GHz Opteron,
# you'll observe a ~2x difference.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl $num,$num
mov 0,v0
cmplt $num,4,AT
bne AT,.Lexit
ldq $hi0,0($ap) # ap[0]
s8addq $num,16,AT
ldq $aj,8($ap)
subq sp,AT,sp
ldq $bi,0($bp) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq $n0,0($n0)
and sp,AT,sp
mulq $hi0,$bi,$lo0
ldq $hi1,0($np) # np[0]
umulh $hi0,$bi,$hi0
ldq $nj,8($np)
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov 2,$j
umulh $aj,$bi,$ahi
mov sp,$tp
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
s8addq $j,$np,$nj
.align 4
.L1st:
.set noreorder
ldq $aj,0($aj)
addl $j,1,$j
ldq $nj,0($nj)
lda $tp,8($tp)
addq $alo,$hi0,$lo0
mulq $aj,$bi,$alo
cmpult $lo0,$hi0,AT
addq $nlo,$hi1,$lo1
mulq $nj,$m1,$nlo
addq $ahi,AT,$hi0
cmpult $lo1,$hi1,v0
cmplt $j,$num,$tj
umulh $aj,$bi,$ahi
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
s8addq $j,$np,$nj
stq $lo1,-8($tp)
nop
unop
bne $tj,.L1st
.set reorder
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
stq $lo1,0($tp)
addq $hi1,$hi0,$hi1
cmpult $hi1,$hi0,AT
stq $hi1,8($tp)
stq AT,16($tp)
mov 1,$i
.align 4
.Louter:
s8addq $i,$bp,$bi
ldq $hi0,0($ap)
ldq $aj,8($ap)
ldq $bi,0($bi)
ldq $hi1,0($np)
ldq $nj,8($np)
ldq $tj,0(sp)
mulq $hi0,$bi,$lo0
umulh $hi0,$bi,$hi0
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
mov 2,$j
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov sp,$tp
umulh $aj,$bi,$ahi
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
.align 4
.Linner:
.set noreorder
ldq $tj,8($tp) #L0
nop #U1
ldq $aj,0($aj) #L1
s8addq $j,$np,$nj #U0
ldq $nj,0($nj) #L0
nop #U1
addq $alo,$hi0,$lo0 #L1
lda $tp,8($tp)
mulq $aj,$bi,$alo #U1
cmpult $lo0,$hi0,AT #L0
addq $nlo,$hi1,$lo1 #L1
addl $j,1,$j
mulq $nj,$m1,$nlo #U1
addq $ahi,AT,$hi0 #L0
addq $lo0,$tj,$lo0 #L1
cmpult $lo1,$hi1,v0 #U0
umulh $aj,$bi,$ahi #U1
cmpult $lo0,$tj,AT #L0
addq $lo1,$lo0,$lo1 #L1
addq $nhi,v0,$hi1 #U0
umulh $nj,$m1,$nhi #U1
s8addq $j,$ap,$aj #L0
cmpult $lo1,$lo0,v0 #L1
cmplt $j,$num,$tj #U0 # borrow $tj
addq $hi0,AT,$hi0 #L0
addq $hi1,v0,$hi1 #U1
stq $lo1,-8($tp) #L1
bne $tj,.Linner #U0
.set reorder
ldq $tj,8($tp)
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
ldq $tj,16($tp)
addq $lo1,$lo0,$j
cmpult $j,$lo0,v0
addq $hi1,v0,$hi1
addq $hi1,$hi0,$lo1
stq $j,0($tp)
cmpult $lo1,$hi0,$hi1
addq $lo1,$tj,$lo1
cmpult $lo1,$tj,AT
addl $i,1,$i
addq $hi1,AT,$hi1
stq $lo1,8($tp)
cmplt $i,$num,$tj # borrow $tj
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,0($tp)
ldq $lo1,0($np)
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
or $hi0,AT,$hi0
stq $lo0,0($rp)
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
lda $tp,8($tp)
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $code;
close STDOUT;
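
The routine above is a software-pipelined version of the usual word-by-word (CIOS-style) Montgomery multiplication. For orientation, a plain-C sketch of the same computation follows; it assumes 64-bit BN_ULONG limbs, substitutes the unsigned __int128 compiler extension for the mulq/umulh pair, and uses a C99 variable-length array where the assembly uses alloca, so it is illustrative rather than a drop-in replacement:

typedef unsigned long long BN_ULONG;            /* 64-bit limbs, as on Alpha */

/* rp[] = ap[]*bp[]*R^-1 mod np[], R = 2^(64*num), *n0p = -np[0]^-1 mod 2^64 */
static int bn_mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
                           const BN_ULONG *bp, const BN_ULONG *np,
                           const BN_ULONG *n0p, int num)
{
    BN_ULONG tp[num + 2];                       /* C99 VLA instead of alloca */
    BN_ULONG n0 = *n0p;
    int i, j;

    if (num < 1)
        return 0;
    for (j = 0; j < num + 2; j++)
        tp[j] = 0;

    for (i = 0; i < num; i++) {
        unsigned __int128 t;                    /* stands in for mulq/umulh  */
        BN_ULONG c = 0, m;

        for (j = 0; j < num; j++) {             /* tp += ap * bp[i]          */
            t = (unsigned __int128)ap[j] * bp[i] + tp[j] + c;
            tp[j] = (BN_ULONG)t;
            c = (BN_ULONG)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + c;
        tp[num] = (BN_ULONG)t;
        tp[num + 1] = (BN_ULONG)(t >> 64);

        m = tp[0] * n0;                         /* tp[0]+m*np[0] == 0 mod 2^64 */
        t = (unsigned __int128)np[0] * m + tp[0];
        c = (BN_ULONG)(t >> 64);
        for (j = 1; j < num; j++) {             /* tp = (tp + m*np) >> 64    */
            t = (unsigned __int128)np[j] * m + tp[j] + c;
            tp[j - 1] = (BN_ULONG)t;
            c = (BN_ULONG)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + c;
        tp[num - 1] = (BN_ULONG)t;
        tp[num] = tp[num + 1] + (BN_ULONG)(t >> 64);
    }

    /* conditional final subtraction: keep tp - np unless that borrows past
       the top carry word, in which case keep tp itself                     */
    {
        BN_ULONG borrow = 0;

        for (j = 0; j < num; j++) {
            unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
            rp[j] = (BN_ULONG)d;
            borrow = (BN_ULONG)(d >> 64) & 1;
        }
        if (borrow > tp[num])
            for (j = 0; j < num; j++)
                rp[j] = tp[j];
    }
    return 1;
}

The assembly performs the same final selection without a data-dependent branch, by masking the source pointer of the copy loop (the and/bic/bis sequence before .Lcopy).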

@@ -0,0 +1,289 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's a rather low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine
# runs in ~45 cycles on a dual-issue core such as Cortex-A8, which is
# ~50% faster than compiler-generated code. For ECDH and ECDSA verify
# (but not for ECDSA sign) it means a 25%-45% improvement depending on
# key length, more for longer keys. Even though the NEON 1x1
# multiplication runs in even fewer cycles, ~30, the improvement is
# measurable only on longer keys. One has to optimize code elsewhere
# to get the NEON glow...
#
# April 2014
#
# Doubled bn_GF2m_mul_2x2 performance by using the algorithm from the
# paper referenced below, which improves ECDH and ECDSA verify
# benchmarks by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
bne .LNEON
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
#endif
___
}
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush
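
For reference, mul_1x1_ialu above computes a carry-less (GF(2)[x]) 32x32 -> 64-bit product with a 3-bit lookup table, and bn_GF2m_mul_2x2 combines three such products Karatsuba-style. A bit-at-a-time C sketch of the same arithmetic, with made-up helper names (this is the straightforward reference, not the table-driven variant the assembly uses):

typedef unsigned int u32;
typedef unsigned long long u64;

/* carry-less 32x32 -> 64-bit multiply: xor replaces add, so no carries */
static u64 gf2m_mul_1x1(u32 a, u32 b)
{
    u64 r = 0;
    int i;

    for (i = 0; i < 32; i++)
        if ((b >> i) & 1)
            r ^= (u64)a << i;
    return r;
}

/* (a1*x^32 + a0) * (b1*x^32 + b0) via three 1x1 products (Karatsuba):
   a1b1*x^64 ^ ((a1^a0)(b1^b0) ^ a1b1 ^ a0b0)*x^32 ^ a0b0, returned here
   as two 64-bit halves rather than the four 32-bit words the real
   bn_GF2m_mul_2x2 stores                                               */
static void gf2m_mul_2x2(u64 r[2], u32 a1, u32 a0, u32 b1, u32 b0)
{
    u64 hh = gf2m_mul_1x1(a1, b1);
    u64 ll = gf2m_mul_1x1(a0, b0);
    u64 mm = gf2m_mul_1x1(a1 ^ a0, b1 ^ b0) ^ hh ^ ll;

    r[0] = ll ^ (mm << 32);     /* low 64 bits  */
    r[1] = hh ^ (mm >> 32);     /* high 64 bits */
}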

@@ -0,0 +1,676 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it's
# exclusively about decorations; the ABI and instruction syntax are
# identical.
#
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to an
# incredible ~380% (yes, 4.8x faster) for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON path is
# chosen for execution on all NEON-capable processors, because the gain
# on the others outweighs the marginal loss on Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
bne .Louter
ldr $rp,[$_rp] @ pull rp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmull.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0
vmull.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmlal.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
add $tinptr,sp,#16
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.32 {$N0-$N1}, [$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero
sub $inner,$num,#8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
vld1.64 {$A7xB},[$tinptr,:128]!
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
.LNEON_inner:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.64 {$A7xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vmlal.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_inner
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
subs $outer,$outer,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,$temp,#16
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vshr.u64 $temp,$temp,#16
bne .LNEON_outer
mov $toutptr,sp
mov $inner,$num
.LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vld1.64 {$A7xB}, [$tinptr, :128]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
.LNEON_tail2:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
veor q0,q0,q0
sub r11,$bptr,sp @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
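
One detail worth spelling out from the integer path above: the .Lsub/.Lcopy tail performs the conditional final subtraction without branching on the borrow. tp - np is written to rp first, and the source pointer of the copy loop is then selected by masking (the and/bic/orr sequence), so the same instructions run whichever value is kept. A rough C equivalent of that selection, assuming the net borrow (1 if tp < np, else 0) has already been computed; the function name is made up:

#include <stdint.h>

typedef unsigned int BN_ULONG;              /* 32-bit limbs on ARMv4 */

static void mont_copy_tail(BN_ULONG *rp, BN_ULONG *tp, int num, BN_ULONG borrow)
{
    uintptr_t mask = 0 - (uintptr_t)borrow; /* all-ones if borrow, else 0 */
    /* src = borrow ? tp : rp, computed without a branch */
    BN_ULONG *src = (BN_ULONG *)(((uintptr_t)tp & mask) |
                                 ((uintptr_t)rp & ~mask));
    int i;

    for (i = 0; i < num; i++) {
        rp[i] = src[i];                     /* copy result, or refresh in place */
        tp[i] = 0;                          /* zap the temporary */
    }
}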

@@ -0,0 +1,774 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_part_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP(0,$a,"",0)); # *a
&mov($tmp2,&DWP(0,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP(0,$r,"",0),$tmp1); # *r
&add($a, 4);
&add($b, 4);
&add($r, 4);
&dec($num) if ($i != 6);
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
&cmp(&wparam(4),0);
&je(&label("pw_end"));
&mov($num,&wparam(4)); # get dl
&cmp($num,0);
&je(&label("pw_end"));
&jge(&label("pw_pos"));
&comment("pw_neg");
&mov($tmp2,0);
&sub($tmp2,$num);
&mov($num,$tmp2);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_neg_finish"));
&set_label("pw_neg_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl<0 Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("pw_end")) if ($i != 6);
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&set_label("pw_pos_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl>0 Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_tail_nc".$i));
&dec($num) if ($i != 6);
&jz(&label("pw_end")) if ($i != 6);
}
&mov($c,1);
&jmp(&label("pw_end"));
&set_label("pw_nc_loop",0);
for ($i=0; $i<8; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_tail_nc".$i,0);
&dec($num) if ($i != 6);
&jz(&label("pw_nc_end")) if ($i != 6);
}
&set_label("pw_nc_end",0);
&mov($c,0);
&set_label("pw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
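
All of the loops generated above boil down to a few carry-propagating word primitives. As one example, a plain-C reference for what bn_mul_add_words computes (r[] += a[] * w, returning the final carry); both the SSE2 path and the unrolled integer path do exactly this, eight words per iteration, and the _ref suffix below marks the function as illustrative:

typedef unsigned int BN_ULONG;              /* 32-bit words on x86 */

static BN_ULONG bn_mul_add_words_ref(BN_ULONG *r, const BN_ULONG *a,
                                     int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    int i;

    for (i = 0; i < num; i++) {
        /* 64-bit accumulator plays the role of the edx:eax register pair */
        unsigned long long t = (unsigned long long)a[i] * w + r[i] + carry;

        r[i]  = (BN_ULONG)t;                /* low 32 bits  */
        carry = (BN_ULONG)(t >> 32);        /* high 32 bits */
    }
    return carry;
}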

@@ -0,0 +1,287 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
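
The generated comba routines work column by column: every 32x32 partial product that belongs to output word k is accumulated into a three-word counter (c0,c1,c2), then c0 is stored as r[k] and the counters rotate, which is exactly the register juggling mul_add_c/sqr_add_c emit. A compact C sketch of that scheme (the helper name is made up, and num would be 4 or 8 for the routines above):

typedef unsigned int BN_ULONG;

static void bn_mul_comba_ref(BN_ULONG *r, const BN_ULONG *a,
                             const BN_ULONG *b, int num)
{
    BN_ULONG c0 = 0, c1 = 0, c2 = 0;
    int k, i;

    for (k = 0; k < 2 * num - 1; k++) {
        for (i = 0; i < num; i++) {
            int j = k - i;
            unsigned long long t, s;

            if (j < 0 || j >= num)
                continue;
            t = (unsigned long long)a[i] * b[j];
            /* add the 64-bit product into the (c2:c1:c0) accumulator */
            s  = (unsigned long long)c0 + (BN_ULONG)t;
            c0 = (BN_ULONG)s;
            s  = (unsigned long long)c1 + (BN_ULONG)(t >> 32) + (BN_ULONG)(s >> 32);
            c1 = (BN_ULONG)s;
            c2 += (BN_ULONG)(s >> 32);
        }
        r[k] = c0;                  /* output word k is now complete */
        c0 = c1; c1 = c2; c2 = 0;   /* rotate the accumulator        */
    }
    r[2 * num - 1] = c0;            /* top word */
}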

@@ -0,0 +1,851 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because the minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in a >100% improvement of the 512-bit RSA sign benchmark and 50% for
# the 1024-bit one [in comparison to the original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As can be seen, RSA sign performance improves by 130-30% [less for
# longer keys], while verify improves by 74-13%. DSA performance
# improves by 115-30%.
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of 7
// in the latter case accounts for a two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal. No attempt
// was made to address this, because the original Itanium is hardly
// represented in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
// to measure with the help of the Interval Time Counter indicated that
// the factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue are problematic, because I don't have access to a
// platform-specific instruction-level profiler. On Itanium it should run
// in 56*n ticks, because of the higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// the above loop has to execute one more time, without (p16), which is
// replaced with a merged move of np[8] to the GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
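# For reference (illustrative file name): the module is typically driven
# as
#
#	perl ia64-mont.pl bn-ia64.s
#
# where the optional argument names the output file; without it the code
# is written to stdout.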
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

File diff suppressed because it is too large

View File

@@ -0,0 +1,426 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module is not of direct interest to OpenSSL, because it doesn't
# provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from a
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers though,
# as the code is smaller than 1KB, yet offers a >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There are a number of MIPS ABIs in use, of which O32 and N32/64 are the
# most widely used. Then there is a new contender: NUBI. It appears that
# if one picks the latter, it's possible to arrange the code in an
# ABI-neutral manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, the "thread pointer", former $gp;
# - copy the return value to $t0, former $v0 [or to $a0 if you're
#   adapting old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
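# For reference, a typical invocation (script and file names are
# illustrative only):
#
#	perl mips-mont.pl o32 bn-mips.s
#	perl mips-mont.pl 64  bn-mips.s
#
# The first argument selects the ABI flavour, and the first remaining
# argument that looks like a file name becomes the output file.
#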
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
$code=<<___;
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU $aj,$bi
$LD $alo,$BNSZ($ap)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$MULTU $lo0,$n0
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
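# Descriptive note: the .set noreorder loops below keep an instruction in
# each branch delay slot (the pointer advance following bnez) and issue
# each multiplication well before its mflo/mfhi results are consumed, to
# hide multiplier latency.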
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU $nj,$m1
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo $nlo
mfhi $nhi
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $alo,$BNSZ($ap)
$LD $tj,($sp)
$MULTU $aj,$bi
$LD $nj,($np)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$ADDU $lo0,$tj
$MULTU $lo0,$n0
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU $nj,$m1
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo $nlo
mfhi $nhi
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,$sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: $LD $aj,($ap)
$PTR_ADD $ap,$BNSZ
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large

View File

@@ -0,0 +1,327 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module is not of direct interest to OpenSSL, because it doesn't
# provide better performance for longer keys. While 512-bit RSA private
# key operations are 40% faster, 1024-bit ones are hardly faster at all,
# and longer key operations are slower by up to 20%. It might be of
# interest to embedded system developers though, as it's smaller than
# 1KB, yet offers a ~3x improvement over compiler-generated code.
#
# The module targets the N32 and N64 MIPS ABIs and is currently a bit
# IRIX-centric, i.e. it is likely to require adaptation for other OSes.
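#
# For reference (illustrative invocation): the script takes no arguments
# and writes the generated assembler to stdout, e.g.
#
#	perl mips3-mont.pl > bn-mips3.s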
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";
$FRAME=8*(2+8);
$code=<<___;
#include <asm.h>
#include <regdef.h>
.text
.set noat
.set reorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
.set noreorder
PTR_SUB sp,64
move $fp,sp
.frame $fp,64,ra
slt AT,$num,4
li v0,0
beqzl AT,.Lproceed
nop
jr ra
PTR_ADD sp,$fp,64
.set reorder
.align 5
.Lproceed:
ld $n0,0($n0)
ld $bi,0($bp) # bp[0]
ld $aj,0($ap) # ap[0]
ld $nj,0($np) # np[0]
PTR_SUB sp,16 # place for two extra words
sll $num,3
li AT,-4096
PTR_SUB sp,$num
and sp,AT
sd s0,0($fp)
sd s1,8($fp)
sd s2,16($fp)
sd s3,24($fp)
sd s4,32($fp)
sd s5,40($fp)
sd s6,48($fp)
sd s7,56($fp)
dmultu $aj,$bi
ld $alo,8($ap)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
dmultu $lo0,$n0
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
.align 4
.L1st:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
dmultu $nj,$m1
daddu $hi1,AT
addu $j,8
sd $lo1,($tp)
sltu s7,$j,$num
mflo $nlo
mfhi $nhi
bnez s7,.L1st
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo1,$nlo,$hi1
sltu s7,$lo1,$hi1
daddu $hi1,$nhi,s7
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
sd $lo1,($tp)
daddu $hi1,$hi0
sltu AT,$hi1,$hi0
sd $hi1,8($tp)
sd AT,16($tp)
li $i,8
.align 4
.Louter:
PTR_ADD $bi,$bp,$i
ld $bi,($bi)
ld $aj,($ap)
ld $alo,8($ap)
ld $tj,(sp)
dmultu $aj,$bi
ld $nj,($np)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
daddu $lo0,$tj
dmultu $lo0,$n0
sltu AT,$lo0,$tj
daddu $hi0,AT
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
ld $tj,8($tp)
.align 4
.Linner:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo0,$tj
addu $j,8
dmultu $nj,$m1
sltu AT,$lo0,$tj
daddu $lo1,$lo0
daddu $hi0,AT
sltu s7,$lo1,$lo0
ld $tj,16($tp)
daddu $hi1,s7
sltu AT,$j,$num
mflo $nlo
mfhi $nhi
sd $lo1,($tp)
bnez AT,.Linner
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo0,$tj
sltu s7,$lo0,$tj
daddu $hi0,s7
ld $tj,16($tp)
daddu $lo1,$nlo,$hi1
sltu AT,$lo1,$hi1
daddu $hi1,$nhi,AT
daddu $lo1,$lo0
sltu s7,$lo1,$lo0
daddu $hi1,s7
sd $lo1,($tp)
daddu $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
daddu $lo1,$tj
sltu AT,$lo1,$tj
daddu $hi1,AT
sd $lo1,8($tp)
sd $hi1,16($tp)
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
ld s3,24($fp)
ld s4,32($fp)
ld s5,40($fp)
ld s6,48($fp)
ld s7,56($fp)
li v0,1
jr ra
PTR_ADD sp,$fp,64
.set reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,995 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that the "baseline" performance was
# optimal with respect to instruction set capabilities. A fair comparison
# with the vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives the compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe a
# several-times improvement over code generated by the vendor compiler
# for PA-RISC 1.1, but that "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such a machine with the
# vendor compiler. But to give you a taste, the PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by a factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to the comment in pa-risc2[W].s]. Here comes a catch. The execution
# core of this implementation is actually a 32-bit one, in the sense that
# it operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can a 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication as on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs the "64-bit" computational task in the same
# number of arithmetic operations, most notably multiplications. It
# requires more memory references, most notably to tp[num], but this
# doesn't seem to exhaust memory port capacity. And indeed, the dedicated
# PA-RISC 2.0 code path provides virtually the same performance as
# pa-risc2[W].s: it's ~10% better for the shortest key length and ~10%
# worse for the longest one.
#
# In case it wasn't clear: the module has two distinct code paths,
# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In a 64-bit build naturally only the 2.0 code
# path is assembled. In a 32-bit application context both code paths are
# assembled, a PA-RISC 2.0 CPU is detected at run-time and the proper
# path is taken automatically. Also, in a 32-bit build the module imposes
# a couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
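#
# For reference, a typical invocation (file names are illustrative only):
#
#	perl parisc-mont.pl 64 pa-risc2W-mont.s		# 64-bit build
#	perl parisc-mont.pl 32 pa-risc-mont.s		# 32-bit build
#
# The first argument is the flavour (anything matching /64/ selects the
# 2.0W code), the second the output file. Note that the L/R word-half
# selectors of the xmpyu operands are swapped by the post-processing loop
# at the bottom of this file whenever 64-bit BN_ULONGs are in use, which
# is part of how the "reverse order" trick described above is implemented.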
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$BN_SZ =$SIZE_T;
} else {
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$BN_SZ =$SIZE_T;
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
$BN_SZ=8;
$LEVEL="2.0";
last;
}
}
close CONF;
}
}
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32; # local variables
$tp="%r31";
$ti1="%r29";
$ti0="%r28";
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22"; # passed through stack in 32-bit
$num="%r21"; # passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";
$fp="%r3";
$hi1="%r2";
$hi0="%r1";
$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
$code.=<<___;
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the
# .ALLOW 2.0 directive...
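#
# For example (illustrative only), in a 32-bit build a post-interpolation
# line such as
#
#	ldd	-16(%r22),%r4
#
# is matched by the $ldd handler below and re-emitted as a ".WORD 0x..."
# constant, with the original mnemonic preserved in a trailing comment,
# so that the stream still assembles at the 1.x level declared at the top
# of the file.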
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $sub = sub {
my ($mod,$args) = @_;
my $orig = "sub$mod\t$args";
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
$opcode|=(1<<10); # e1
$opcode|=(1<<8); # e2
$opcode|=(1<<5); # d
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# flip word order in 64-bit mode...
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
# assemble 2.0 instructions in 32-bit mode...
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
s/\bbv\b/bve/gm if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,335 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop; a dedicated
# squaring procedure should then give a further 20%, and the code can
# be adapted for a 32-bit application running on a 64-bit CPU. As for
# the latter, it won't be able to achieve "native" 64-bit performance,
# because in a 32-bit application context every addc instruction has to
# be expanded into addc, two right shifts by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
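# For orientation, the inner loops below use the usual PowerPC carry
# chain for multi-word accumulation; one step looks roughly like this
# (a sketch, compare the .L1st loop further down):
#
#	$UMULL	$alo,$aj,$m0	# low half of ap[j]*bp[i]
#	$UMULH	$ahi,$aj,$m0	# high half
#	addc	$lo0,$alo,$hi0	# add previous high word, set XER[CA]
#	addze	$hi0,$ahi	# fold the carry into the new high word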
$flavour = shift;
if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
$LDX= "lwzx"; # load indexed
$ST= "stw"; # store
$STU= "stwu"; # store and update
$STX= "stwx"; # store indexed
$STUX= "stwux"; # store indexed and update
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
$LDU= "ldu"; # load and update
$LDX= "ldx"; # load indexed
$ST= "std"; # store
$STU= "stdu"; # store and update
$STX= "stdx"; # store indexed
$STUX= "stdux"; # store indexed and update
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
$rp="r3"; $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9"; # $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
$code=<<___;
.machine "any"
.text
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
addze $hi1,$hi1
$UMULL $nlo,$nj,$m1 ; np[1]*m1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
L1st:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LDX $nj,$np,$j ; np[j]
addze $hi0,$ahi
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
addc $lo1,$nlo,$hi1
$UMULH $ahi,$aj,$m0
addze $hi1,$nhi
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
$UMULH $nhi,$nj,$m1
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $j,$j,$BNSZ ; j++
addi $tp,$tp,$BNSZ ; tp++
bdnz- L1st
;L1st
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
li $ovf,0
addc $hi1,$hi1,$hi0
addze $ovf,$ovf ; upmost overflow bit
$ST $hi1,$BNSZ($tp)
li $i,$BNSZ
.align 4
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi0,$hi0
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
$UMULL $nlo,$nj,$m1 ; np[1]*m1
addze $hi1,$hi1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
Linner:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LD $tj,$BNSZ($tp) ; tp[j]
addze $hi0,$ahi
$LDX $nj,$np,$j ; np[j]
addc $lo1,$nlo,$hi1
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi1,$nhi
$UMULH $ahi,$aj,$m0
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addze $hi0,$hi0
$UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addi $j,$j,$BNSZ ; j++
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $tp,$tp,$BNSZ ; tp++
bdnz- Linner
;Linner
$LD $tj,$BNSZ($tp) ; tp[j]
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
addze $hi0,$hi0
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
li $ovf,0
adde $hi1,$hi1,$hi0
addze $ovf,$ovf
$ST $hi1,$BNSZ($tp)
;
slwi $tj,$num,`log($BNSZ)/log(2)`
$UCMP $i,$tj
addi $i,$i,$BNSZ
ble- Louter
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.byte 0,12,4,0,0x80,12,6,0
.long 0
.size .bn_mul_mont_int,.-.bn_mul_mont_int
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, hence the
# effort. And indeed, the module delivers 55%-90%(*) improvement on the
# heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit key
# lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on z196. This is
# for a 64-bit build. In the 32-bit "highgprs" case the improvement is
# even higher, for example on z990 it was measured at 80%-150%. ECDSA
# sign is a modest 9%-12% faster. Keep in mind that these coefficients
# are not the ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
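# As a rough sketch of what _mul_1x1 below computes (not literal code):
# a 16-entry table of carry-less products of a by every 4-bit value is
# built from a, a<<1, a<<2, a<<3 and their XOR combinations,
#
#	tab[i] = (i&1 ? a : 0) ^ (i&2 ? a<<1 : 0) ^
#		 (i&4 ? a<<2 : 0) ^ (i&8 ? a<<3 : 0);
#
# after which b is consumed 4 bits at a time, the selected entries being
# XOR-ed into the 128-bit (hi,lo) result at the matching bit offsets.
# The three top bits of a are masked out of the table and folded in
# separately, so that the shifted table entries never overflow 64 bits.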
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least a hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize the amount of multiplications
# at the cost of an increase in other operations, bn_mul_mont aims to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads; follow the _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
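# As a reading aid, the loops below follow the textbook word-serial
# Montgomery multiplication; per iteration of the outer loop they do,
# roughly (a sketch in C-like pseudo-code, not the literal code),
#
#	m  = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64;
#	tp = (tp + ap*bp[i] + np*m) / 2^64;	/* division is exact */
#
# followed by one final conditional subtraction of np (.Lsub/.Lcopy).
# _dswap is removed in 64-bit builds and becomes "rllg reg,reg,32" in
# the 31-bit "highgprs" build, swapping the two 32-bit BN_ULONGs that a
# single 64-bit load fetched.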
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
# int bn_mul_mont(
$rp="%r2"; # BN_ULONG *rp,
$ap="%r3"; # const BN_ULONG *ap,
$bp="%r4"; # const BN_ULONG *bp,
$np="%r5"; # const BN_ULONG *np,
$n0="%r6"; # const BN_ULONG *n0,
#$num="160(%r15)" # int num);
$bi="%r2"; # zaps rp
$j="%r7";
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$count="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,713 @@
.ident "s390x.S, version 1.1"
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
.text
#define zero %r0
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_add_words
.type bn_mul_add_words,@function
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r13,48(%r15)
lghi %r2,3
lghi %r12,0 // carry = 0
slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
lg %r7,0(%r3) // ap[0]
lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
brct %r4,.Loop4_madd
j .Loop4_madd_tail
.Loop4_madd:
mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
mlgr %r6,%r5
lg %r9,40(%r3)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
.Loop4_madd_tail:
mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
lgr %r2,zero // return value
alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
lgr %r12,%r6
la %r3,8(%r3) // i++
brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_words
.type bn_mul_words,@function
.align 4
bn_mul_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_mul // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_mul
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_mul // without touching condition code:-)
.Lend_mul:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_mul
j .Lend_mul
.size bn_mul_words,.-bn_mul_words
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
.globl bn_sqr_words
.type bn_sqr_words,@function
.align 4
bn_sqr_words:
ltgfr %r4,%r4
bler %r14
stmg %r6,%r7,48(%r15)
srag %r1,%r4,2 // cnt=len/4
jz .Loop1_sqr
.Loop4_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
lg %r7,8(%r3)
mlgr %r6,%r7
stg %r7,16(%r2)
stg %r6,24(%r2)
lg %r7,16(%r3)
mlgr %r6,%r7
stg %r7,32(%r2)
stg %r6,40(%r2)
lg %r7,24(%r3)
mlgr %r6,%r7
stg %r7,48(%r2)
stg %r6,56(%r2)
la %r3,32(%r3)
la %r2,64(%r2)
brct %r1,.Loop4_sqr
lghi %r1,3
nr %r4,%r1 // cnt=len%4
jz .Lend_sqr
.Loop1_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
la %r3,8(%r3)
la %r2,16(%r2)
brct %r4,.Loop1_sqr
.Lend_sqr:
lmg %r6,%r7,48(%r15)
br %r14
.size bn_sqr_words,.-bn_sqr_words
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
.globl bn_div_words
.type bn_div_words,@function
.align 4
bn_div_words:
dlgr %r2,%r4
lgr %r2,%r3
br %r14
.size bn_div_words,.-bn_div_words
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_add_words
.type bn_add_words,@function
.align 4
bn_add_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jz .Loop1_add // carry is incidentally cleared if branch taken
algr %r2,%r2 // clear carry
.Loop4_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
alcg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
alcg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
alcg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_add
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_add // without touching condition code:-)
.Lexit_add:
lghi %r2,0
alcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.Loop1_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_add
j .Lexit_add
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_sub_words
.type bn_sub_words,@function
.align 4
bn_sub_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
slgr %r2,%r2 // clear borrow
.Loop1_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_sub
j .Lexit_sub
.Loop4_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
slbg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
slbg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
slbg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_sub
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_sub // without touching condition code:-)
.Lexit_sub:
lghi %r2,0
slbgr %r2,%r2
lcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.size bn_sub_words,.-bn_sub_words
#define c1 %r1
#define c2 %r5
#define c3 %r8
#define mul_add_c(ai,bi,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,bi*8(%r4); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
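// The macro above accumulates one 128-bit product into the running
// column sum kept in (c3:c2:c1); in other words it performs, roughly,
//
//	(hi,lo)  = a[ai]*b[bi];
//	c1      += lo;			// may carry out
//	c2      += hi + carry;		// may carry out again
//	c3      += carry;
//
// and the comba routines below just issue one such step per (ai,bi)
// pair contributing to the output word currently being formed.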
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba8
.type bn_mul_comba8,@function
.align 4
bn_mul_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(4,0,c2,c3,c1);
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
mul_add_c(0,4,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(0,5,c3,c1,c2);
mul_add_c(1,4,c3,c1,c2);
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
mul_add_c(4,1,c3,c1,c2);
mul_add_c(5,0,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(6,0,c1,c2,c3);
mul_add_c(5,1,c1,c2,c3);
mul_add_c(4,2,c1,c2,c3);
mul_add_c(3,3,c1,c2,c3);
mul_add_c(2,4,c1,c2,c3);
mul_add_c(1,5,c1,c2,c3);
mul_add_c(0,6,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
mul_add_c(0,7,c2,c3,c1);
mul_add_c(1,6,c2,c3,c1);
mul_add_c(2,5,c2,c3,c1);
mul_add_c(3,4,c2,c3,c1);
mul_add_c(4,3,c2,c3,c1);
mul_add_c(5,2,c2,c3,c1);
mul_add_c(6,1,c2,c3,c1);
mul_add_c(7,0,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
mul_add_c(7,1,c3,c1,c2);
mul_add_c(6,2,c3,c1,c2);
mul_add_c(5,3,c3,c1,c2);
mul_add_c(4,4,c3,c1,c2);
mul_add_c(3,5,c3,c1,c2);
mul_add_c(2,6,c3,c1,c2);
mul_add_c(1,7,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
mul_add_c(2,7,c1,c2,c3);
mul_add_c(3,6,c1,c2,c3);
mul_add_c(4,5,c1,c2,c3);
mul_add_c(5,4,c1,c2,c3);
mul_add_c(6,3,c1,c2,c3);
mul_add_c(7,2,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
mul_add_c(7,3,c2,c3,c1);
mul_add_c(6,4,c2,c3,c1);
mul_add_c(5,5,c2,c3,c1);
mul_add_c(4,6,c2,c3,c1);
mul_add_c(3,7,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
mul_add_c(4,7,c3,c1,c2);
mul_add_c(5,6,c3,c1,c2);
mul_add_c(6,5,c3,c1,c2);
mul_add_c(7,4,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
mul_add_c(7,5,c1,c2,c3);
mul_add_c(6,6,c1,c2,c3);
mul_add_c(5,7,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
mul_add_c(6,7,c2,c3,c1);
mul_add_c(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
mul_add_c(7,7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba8,.-bn_mul_comba8
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba4
.type bn_mul_comba4,@function
.align 4
bn_mul_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg	c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(3,3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg	%r6,%r8,48(%r15)
br %r14
.size bn_mul_comba4,.-bn_mul_comba4
#define sqr_add_c(ai,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlgr %r6,%r7; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
#define sqr_add_c2(ai,aj,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,aj*8(%r3); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
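// sqr_add_c adds the square a[ai]^2 into the column sum, while
// sqr_add_c2 adds the cross product a[ai]*a[aj] twice (it stands for
// both a[ai]*a[aj] and a[aj]*a[ai]), which is why its add/add-with-carry
// triplet appears twice in the body above.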
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba8
.type bn_sqr_comba8,@function
.align 4
bn_sqr_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
sqr_add_c2(4,0,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(5,0,c3,c1,c2);
sqr_add_c2(4,1,c3,c1,c2);
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
sqr_add_c2(4,2,c1,c2,c3);
sqr_add_c2(5,1,c1,c2,c3);
sqr_add_c2(6,0,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
sqr_add_c2(7,0,c2,c3,c1);
sqr_add_c2(6,1,c2,c3,c1);
sqr_add_c2(5,2,c2,c3,c1);
sqr_add_c2(4,3,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
sqr_add_c(4,c3,c1,c2);
sqr_add_c2(5,3,c3,c1,c2);
sqr_add_c2(6,2,c3,c1,c2);
sqr_add_c2(7,1,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
sqr_add_c2(7,2,c1,c2,c3);
sqr_add_c2(6,3,c1,c2,c3);
sqr_add_c2(5,4,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
sqr_add_c(5,c2,c3,c1);
sqr_add_c2(6,4,c2,c3,c1);
sqr_add_c2(7,3,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
sqr_add_c2(7,4,c3,c1,c2);
sqr_add_c2(6,5,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
sqr_add_c(6,c1,c2,c3);
sqr_add_c2(7,5,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
sqr_add_c2(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
sqr_add_c(7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba8,.-bn_sqr_comba8
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba4
.type bn_sqr_comba4,@function
.align 4
bn_sqr_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba4,.-bn_sqr_comba4

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for all SPARCv9 processors and one for VIS3-capable ones. The former
# delivers ~25-45% more, more for longer keys, on the heaviest DH and
# DSA verify operations on venerable UltraSPARC II. On T4 the VIS3 code is
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.
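# For orientation: the VIS3 path glues the 32-bit input halves into two
# 64-bit operands and lets the carry-less multiply pair xmulx/xmulxhi
# (emitted below as .word constants so that older assemblers accept the
# file) produce the low and high 64 bits of the GF(2) product directly.
# The .Lsoftware path falls back to the same 16-entry table scheme as
# the C code in bn_gf2m.c, with the table kept on the stack.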
$locals=16*8;
$tab="%l0";
@T=("%g2","%g3");
@i=("%g4","%g5");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl bn_GF2m_mul_2x2
.align 16
bn_GF2m_mul_2x2:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
andcc %g1, SPARCV9_VIS3, %g0
bz,pn %icc,.Lsoftware
nop
sllx %o1, 32, %o1
sllx %o3, 32, %o3
or %o2, %o1, %o1
or %o4, %o3, %o3
.word 0x95b262ab ! xmulx %o1, %o3, %o2
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
srlx %o2, 32, %o1 ! 13 cycles later
st %o2, [%o0+0]
st %o1, [%o0+4]
srlx %o4, 32, %o3
st %o4, [%o0+8]
retl
st %o3, [%o0+12]
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
sllx %i3,32,$b
or %i2,$a,$a
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
sllx $a,1,$a2
srax $a4,63,@i[1] ! broadcast 61st bit
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
srlx $a48,2,$a48
srax $a2,63,@i[0] ! broadcast 62nd bit
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
srax $a1,63,$lo ! broadcast 63rd bit
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
sllx $a1,3,$a8
and $b,$lo,$lo
and $b,@i[0],@i[0]
and $b,@i[1],@i[1]
stx %g0,[$tab+0*8] ! tab[0]=0
xor $a1,$a2,$a12
stx $a1,[$tab+1*8] ! tab[1]=a1
stx $a2,[$tab+2*8] ! tab[2]=a2
xor $a4,$a8,$a48
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
xor $a4,$a1,$a1
stx $a4,[$tab+4*8] ! tab[4]=a4
xor $a4,$a2,$a2
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
xor $a4,$a12,$a12
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
xor $a48,$a1,$a1
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
xor $a48,$a2,$a2
stx $a8,[$tab+8*8] ! tab[8]=a8
xor $a48,$a12,$a12
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
xor $a4,$a1,$a1
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
xor $a4,$a2,$a2
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
xor $a4,$a12,$a12
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
srlx $lo,1,$hi
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
sllx $lo,63,$lo
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
srlx @i[0],2,@T[0]
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
sllx @i[0],62,$a1
sllx $b,3,@i[0]
srlx @i[1],3,@T[1]
and @i[0],`0xf<<3`,@i[0]
sllx @i[1],61,$a2
ldx [$tab+@i[0]],@i[0]
srlx $b,4-3,@i[1]
xor @T[0],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
xor $a1,$lo,$lo
ldx [$tab+@i[1]],@i[1]
xor @T[1],$hi,$hi
xor @i[0],$lo,$lo
srlx $b,8-3,@i[0]
xor $a2,$lo,$lo
and @i[0],`0xf<<3`,@i[0]
___
for($n=1;$n<14;$n++) {
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
srlx $b,`($n+2)*4`-3,@i[1]
xor @T[1],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
sllx @i[0],`($n+1)*4`,@T[0]
xor @T[1],$hi,$hi
srlx @i[0],`64-($n+1)*4`,@T[1]
xor @T[0],$lo,$lo
xor @T[1],$hi,$hi
srlx $lo,32,%i1
st $lo,[%i0+0]
st %i1,[%i0+4]
srlx $hi,32,%i2
st $hi,[%i0+8]
st %i2,[%i0+12]
ret
restore
.type bn_GF2m_mul_2x2,#function
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,606 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for the undertaken effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, will simply sink it.
# Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, there were reasons like the
# desire to experiment with a dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
# (*) Engine accessing the driver in question is on my TODO list.
# For reference, the accelerator is estimated to give 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted
# that 6-10x improvement coefficient does not actually mean
# something extraordinary in terms of absolute [single-threaded]
# performance, as SPARCv9 instruction set is by all means least
# suitable for high performance crypto among other 64 bit
# platforms. 6-10x factor simply places T1 in same performance
# domain as say AMD64 and IA-64. Improvement of RSA verify don't
# appear impressive at all, but it's the sign operation which is
# far more critical/interesting.
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
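# A note on notation in the code below: lines tagged !prologue! and
# !epilogue! are the iterations peeled off the .L1st/.Linner loops by
# the modulo scheduling mentioned above, so that the multiplies for
# iteration j+1 overlap the additions for iteration j. All limbs are
# 32-bit; a 32x32->64 mulx result is split back into limb and carry
# roughly like this (a sketch, not literal code):
#
#	mulx	$apj,$mul0,$tmp0	! 32x32->64-bit product
#	add	$acc0,$car0,$car0	! accumulate into the carry word
#	and	$car0,$mask,$acc0	! low 32 bits -> limb
#	srlx	$car0,32,$car0		! high 32 bits -> next carry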
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=128; }
$car0="%o0";
$car1="%o1";
$car2="%o2"; # 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1"; # 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.Lcopy:
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
mov 1,%i0
ret
restore
___
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2"; # re-use $bp!
$code.=<<___;
.align 32
.Lbn_sqr_mont:
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,882 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike the integer multiplier, which simply stalls the whole
# CPU, the FPU is fully pipelined and can effectively emit a 48-bit
# partial product every cycle. Why not blended SPARC v9? One can argue
# that making this module dependent on the UltraSPARC VIS extension
# limits its binary compatibility. Well yes, it does exclude SPARC64
# prior-V(!) implementations from the compatibility matrix. But the
# rest, the whole Sun UltraSPARC family and the brand new Fujitsu
# SPARC64 V, all support the VIS extension instructions used in this
# module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.
# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves by a few percent for shorter keys and worsens by a
# few percent for longer keys. This is because the USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.
# TODO:
# - modulo-schedule inner loop for better performance (on in-order
# execution core such as UltraSPARC this shall result in further
# noticeable(!) improvement);
# - dedicated squaring procedure[?];
######################################################################
# November 2006
#
# Modulo-scheduled inner loops make it possible to interleave floating
# point and integer instructions and minimize Read-After-Write penalties.
# This results in a *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.
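# In short, the trick below is: ap[j]/np[j] are split into 32-bit
# halves and bp[i] (as well as the n0 product) into 16-bit quarters,
# all converted to double, so that every fmuld forms an exact 48-bit
# partial product; sums of a few such terms still fit the 53-bit
# mantissa and therefore stay exact. The columns are brought back to
# integers with fdtox and stitched together by the srlx/add carry
# chains that follow each group of fdtox stores. This is only a rough
# reading aid; the exact scheduling is in the code itself.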
$fname="bn_mul_mont_fpu";
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) {
$bias=2047;
$frame=192;
} else {
$bias=0;
$frame=128; # 96 rounded up to largest known cache-line
}
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$tp="%l0"; # t[num]
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2"; # to these four vectors as double-precision FP values.
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7"; # 16-bit mask, 0xffff
$n0="%g4"; # reassigned(!) to "64-bit" register
$carry="%i4"; # %i4 reused(!) for a carry bit
# FP register naming chart
#
# ..HILO
# dcba
# --------
# LOa
# LOb
# LOc
# LOd
# HIa
# HIb
# HIc
# HId
# ..a
# ..b
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
$dota="%f24"; $dotb="%f26";
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
save %sp,-$frame-$locals,%sp
cmp $num,4
bl,a,pn %icc,.Lret
clr %i0
andcc $num,1,%g0 ! $num has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl $num,1,$num
sethi %hi(0xffff),$mask
ld [%i4+0],$n0 ! $n0 reassigned, remember?
or $mask,%lo(0xffff),$mask
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
sll $num,3,$num ! num*=8
add %sp,$bias,%o0 ! real top of stack
sll $num,2,%o1
add %o1,$num,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,$bias,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,$bias+$frame+$locals,$tp
add $tp,$num,$ap_l
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
add $ap_l,$num,$ap_h
add $ap_h,$num,$np_l
add $np_l,$num,$np_h
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
add $rp,$num,$rp ! readjust input pointers to point
add $ap,$num,$ap ! at the ends too...
add $bp,$num,$bp
add $np,$num,$np
stx %o7,[%sp+$bias+$frame+48] ! save %asi
sub %g0,$num,$i ! i=-num
sub %g0,$num,$j ! j=-num
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add $np,$j,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+$bias+$frame+0]
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
fxtod $alo,$alo
ldda [%o4+0]%asi,$bb
fxtod $ahi,$ahi
ldda [%o4+6]%asi,$bc
fxtod $nlo,$nlo
ldda [%o4+4]%asi,$bd
fxtod $nhi,$nhi
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fxtod $na,$na
std $ahi,[$ap_h+$j]
fxtod $nb,$nb
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fxtod $nc,$nc
std $nhi,[$np_h+$j]
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
add $j,8,$j
std $nlob,[%sp+$bias+$frame+8]
add $ap,$j,%o4
std $nloc,[%sp+$bias+$frame+16]
add $np,$j,%o5
std $nlod,[%sp+$bias+$frame+24]
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
!and %o0,$mask,%o0
!and %o1,$mask,%o1
!and %o2,$mask,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.L1stskip
std $nlod,[%sp+$bias+$frame+24]
.align 32 ! incidentally already aligned !
.L1st:
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
and %o1,$mask,%o1
and %o2,$mask,%o2
fmuld $ahi,$bb,$ahib
sllx %o1,16,%o1
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
sllx %o2,32,%o2
fmuld $ahi,$bc,$ahic
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
or %o2,%o0,%o0
fmuld $ahi,$bd,$ahid
or %o7,%o0,%o0 ! 64-bit result
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
addcc %g1,%o0,%o0
faddd $dota,$nloa,$nloa
srlx %o3,16,%g1 ! 34-bit carry
faddd $dotb,$nlob,$nlob
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+$bias+$frame+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,$carry
stx %o4,[$tp] ! tp[num-1]=
ba .Louter
add $i,8,$i
.align 32
.Louter:
sub %g0,$num,$j ! j=-num
add %sp,$bias+$frame+$locals,$tp
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[i]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [$tp],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+$bias+$frame+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
ldda [%o4+0]%asi,$bb
ldda [%o4+6]%asi,$bc
ldda [%o4+4]%asi,$bd
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
ldd [$ap_l+$j],$alo ! load a[j] in double format
fxtod $na,$na
ldd [$ap_h+$j],$ahi
fxtod $nb,$nb
ldd [$np_l+$j],$nlo ! load n[j] in double format
fxtod $nc,$nc
ldd [$np_h+$j],$nhi
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
add $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why?
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
ldx [$tp],%o7
faddd $nloc,$nhia,$nloc
addcc %o7,%o0,%o0
! end-of-why?
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.Linnerskip
std $nlod,[%sp+$bias+$frame+24]
ba .Linner
nop
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
faddd $nloc,$nhia,$nloc
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
addcc %o7,%o0,%o0
fdtox $nloc,$nloc
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
addcc $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+$bias+$frame+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+40],%o5
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc $carry,%o4,%o4
stx %o4,[$tp] ! tp[num-1]
mov %g1,$carry
bcs,a %xcc,.+8
add $carry,1,$carry
addcc $i,8,$i
bnz %icc,.Louter
nop
add $tp,8,$tp ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
ldx [$tp+%o7],%o0
add $np,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add $rp,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
ldx [$tp+%o7],%o0
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.Lzap:
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
stx %g0,[$np_h+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+$bias+$frame+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is now detected at run-time and
# this routine is not called on CPUs that cannot execute it. Do note
# that fzeros is not the only VIS dependency! Another dependency is
# implicit and is just _a_ numerical value loaded into the %asi register,
# which the assembler can't recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
print $code;
# flush
close STDOUT;

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Wrapper around 'rep montmul', VIA-specific instruction accessing
# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
#
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
# different software configurations on 1.5GHz VIA Esther processor.
# Lines marked with "software integer" denote performance of hand-
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
# refers to hand-coded SSE2 Montgomery multiplication procedure found
# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
# Padlock SDK 2.0.1 available for download from VIA, which naturally
# utilizes the magic 'repz montmul' instruction. And finally "hardware
# this" refers to *this* implementation which also uses 'repz montmul'
#
# sign verify sign/s verify/s
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
#
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
#
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
#
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
#
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
#
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
#
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
#
# To give you some other reference point here is output for 2.4GHz P4
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
# SSE2" in above terms.
#
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
# key length;
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
$func="bn_mul_mont_padlock";
$pad=16*1; # amount of reserved bytes on top of every vector
# stack layout
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
$A=&DWP(4,"esp");
$B=&DWP(8,"esp");
$T=&DWP(12,"esp");
$M=&DWP(16,"esp");
$scratch=&DWP(20,"esp");
$rp=&DWP(24,"esp"); # these are mine
$sp=&DWP(28,"esp");
# &DWP(32,"esp") # 32 byte scratch area
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that the SDK suggests unconditionally allocating 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example, 1024-bit private RSA key operations suffer a >30%
# penalty. I allocate only as much as is actually required...
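# Illustrative sketch, not part of the original: the sizing performed below
# amounts to 64 bytes of parameter block/scratch plus four vectors of
# 4*num+$pad bytes each, e.g. for num=32 (a 1024-bit modulus) roughly
# 64 + 4*(4*32+16) = 640 bytes instead of the 2K per vector suggested by
# the SDK. A hypothetical helper, never called:
sub padlock_frame_bytes { my ($num,$pad)=@_; return 64 + 4*(4*$num+$pad); }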
&function_begin($func);
&xor ("eax","eax");
&mov ("ecx",&wparam(5)); # num
# meet VIA's limitations for num [note that the specification
# expresses them in bits, while we work with the number of 32-bit words]
&test ("ecx",3);
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&cld ();
&mov ("edi",&wparam(0)); # rp
&mov ("eax",&wparam(1)); # ap
&mov ("ebx",&wparam(2)); # bp
&mov ("edx",&wparam(3)); # np
&mov ("esi",&wparam(4)); # n0
&mov ("esi",&DWP(0,"esi")); # *n0
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
&neg ("ebp");
&add ("ebp","esp");
&and ("ebp",-64); # align to cache-line
&xchg ("ebp","esp"); # alloca
&mov ($rp,"edi"); # save rp
&mov ($sp,"ebp"); # save esp
&mov ($mZeroPrime,"esi");
&lea ("esi",&DWP(64,"esp")); # tp
&mov ($T,"esi");
&lea ("edi",&DWP(32,"esp")); # scratch area
&mov ($scratch,"edi");
&mov ("esi","eax");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
&mov ("ecx","ebp");
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
&mov ("ecx","ebp");
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&mov ("ebp",$sp);
&xor ("eax","eax");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&inc ("eax"); # signal "done"
&popf ();
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
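# Illustrative reference, not part of the original module: the mulx/umulxhi
# pair relied upon below computes the low and high 64 bits of a 64x64-bit
# product. A hypothetical big-integer model of that pair, never called:
use Math::BigInt;
sub mulx_umulxhi_ref {
my ($a,$b) = @_;				# 64-bit unsigned values
my $p  = Math::BigInt->new("$a")->bmul("$b");
my $lo = $p->copy->bmod(Math::BigInt->new(2)->bpow(64));	# what mulx returns
my $hi = $p->copy->brsft(64);					# what umulxhi returns
return ($lo,$hi);
}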
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
___
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # BN_ULONG *rp,
$ap="%o1"; # const BN_ULONG *ap,
$bp="%o2"; # const BN_ULONG *bp,
$np="%o3"; # const BN_ULONG *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is even
# and >=6
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0] |
# +-------------------------------+
# . .
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 ap[1..0] | converted ap[]
# +-------------------------------+
# | __int64 np[1..0] | converted np[]
# +-------------------------------+
# | __int64 ap[3..2] |
# . .
# . .
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp
ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0
ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj
ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj
sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! copy or in-place refresh
ld [$ap+0], $t2
ld [$ap+4], $t3
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option to produce a "universal" binary and let
# the programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
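# For example (illustrative, derived from the encoding above, not part of
# the original): with rd=%o2=10, rs1=%o0=8, rs2=%o1=9 and opf=0x011,
# "addxc %o0,%o1,%o2" is emitted as ".word 0x95b20229".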
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large

View File

@@ -0,0 +1,313 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later,
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that the above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&sar (@i[1],31); # broardcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,593 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
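# Illustrative reference, not part of the original module: a big-integer
# sketch of the Montgomery reduction which the word-serial loops below
# implement, assuming R = 2^(32*num) and an odd modulus N (the real code
# takes n0 = -N^-1 mod 2^32 from the argument block instead of recomputing
# it). Hypothetical helper, never called:
use Math::BigInt;
sub ref_redc {
my ($T,$N,$num) = @_;				# Math::BigInt objects, 0 <= T < R*N
my $R  = Math::BigInt->new(1)->blsft(32*$num);
my $n0 = $N->copy->bmodinv($R)->bneg->bmod($R);	# -N^-1 mod R
my $m  = $T->copy->bmod($R)->bmul($n0)->bmod($R);
my $t  = $T->copy->badd($m->bmul($N))->brsft(32*$num);	# exact: T+m*N is divisible by R
return $t->bcmp($N) >= 0 ? $t->bsub($N) : $t;	# conditional final subtraction
}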
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging a 2K window between the stack
# pointer and the ap argument [np is also a position-sensitive vector,
# but it's assumed to be near ap, as it's allocated at about the same
# time].
&mov ("eax","esp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the code below provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/perl
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
require("x86/mul_add.pl");
require("x86/mul.pl");
require("x86/sqr.pl");
require("x86/div.pl");
require("x86/add.pl");
require("x86/sub.pl");
require("x86/comba.pl");
&asm_init($ARGV[0],$0);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,277 @@
#!/usr/local/bin/perl
# x86 assembler
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 to load the return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, this is the last loop iteration
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
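# (c2,c1,c0) += 2*a[ai]*a[bi], used for the off-diagonal terms (ai != bi)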
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 on the pass that stores r[]
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
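# Comba (column-wise) multiplication: computes the full 2*num-word
# product r[] = a[] * b[] for a fixed num (4 or 8), accumulating one
# output column at a time in the rotating registers $c0/$c1/$c2.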
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
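# Comba squaring: computes the 2*num-word square r[] = a[]^2 for a
# fixed num (4 or 8), using sqr_add_c for the diagonal products and
# sqr_add_c2 for the doubled off-diagonal products.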
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
1;

View File

@@ -0,0 +1,15 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_div_words
{
local($name)=@_;
&function_begin($name,"");
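# bn_div_words(h,l,d): divides the two-word value h:l (edx:eax) by d
# with a single "div" instruction; the quotient is returned in eax
# (the remainder left in edx is discarded).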
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ebx",&wparam(2)); #
&div("ebx");
&function_end($name);
}
1;

View File

@@ -0,0 +1,3 @@
#!/usr/local/bin/perl
# x86 assembler

View File

@@ -0,0 +1,77 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
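# bn_mul_words(r,a,num,w): r[i] = a[i]*w + carry for num words; the
# final carry is returned in eax.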
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
1;

View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
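# bn_mul_add_words(r,a,num,w): r[i] = r[i] + a[i]*w + carry for num
# words; the final carry is returned in eax.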
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",0);
&mov(&swtmp(0),"ecx"); #
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= *r
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&mov("ecx",&swtmp(0)); #
&add($a,32);
&add($r,32);
&sub("ecx",8);
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&mov($c,&DWP($i*4,$r,"",0)); # load *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
1;

View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sqr_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
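# bn_sqr_words(r,a,num): stores the double-word square of each input,
# (r[2*i+1],r[2*i]) = a[i]^2, for num words.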
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
1;

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
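# bn_sub_words(r,a,b,num): r[i] = a[i] - b[i] with borrow propagation
# across all num words; the final borrow is returned in eax.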
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,638 @@
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*-
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
*
* Q. Version 0.1? It doesn't sound like Andy; he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features its own ABI which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference,
* Win64 implements a different ABI for AMD64 than Linux does.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For the reference. IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
# if defined(_WIN64) || !defined(__LP64__)
# define BN_ULONG unsigned long long
# else
# define BN_ULONG unsigned long
# endif
# undef mul
# undef mul_add
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
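/*-
* mul_add(r,a,word,carry): r += a*word + carry;
* mul(r,a,word,carry):     r  = a*word + carry;
* in both cases the new high word of the product is left in carry.
*/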
# define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
# define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
# undef sqr
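/* sqr(r0,r1,a): (r1:r0) = a*a, the full double-word square of a */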
# define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
: "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul_add(rp[0], ap[0], w, c1);
mul_add(rp[1], ap[1], w, c1);
mul_add(rp[2], ap[2], w, c1);
mul_add(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul_add(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul_add(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul_add(rp[2], ap[2], w, c1);
return c1;
}
return (c1);
}
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul(rp[0], ap[0], w, c1);
mul(rp[1], ap[1], w, c1);
mul(rp[2], ap[2], w, c1);
mul(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul(rp[2], ap[2], w, c1);
}
return (c1);
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0)
return;
while (n & ~3) {
sqr(r[0], r[1], a[0]);
sqr(r[2], r[3], a[1]);
sqr(r[4], r[5], a[2]);
sqr(r[6], r[7], a[3]);
a += 4;
r += 8;
n -= 4;
}
if (n) {
sqr(r[0], r[1], a[0]);
if (--n == 0)
return;
sqr(r[2], r[3], a[1]);
if (--n == 0)
return;
sqr(r[4], r[5], a[2]);
}
}
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
BN_ULONG ret, waste;
asm("divq %4":"=a"(ret), "=d"(waste)
: "a"(l), "d"(h), "g"(d)
: "cc");
return ret;
}
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear carry */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear borrow */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# else
/* Simics 1.4<7 has a buggy sbbq :-( */
# define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1, t2;
int c = 0;
if (n <= 0)
return ((BN_ULONG)0);
for (;;) {
t1 = a[0];
t2 = b[0];
r[0] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[1];
t2 = b[1];
r[1] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[2];
t2 = b[2];
r[2] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[3];
t2 = b[3];
r[3] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
a += 4;
b += 4;
r += 4;
}
return (c);
}
# endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
* c=(c2,c1,c0)
*/
/*
* Keep in mind that carrying into the high part of the multiplication
* result cannot overflow: both factors are at most 2^64-1, so the high
* word of the product is at most 2^64-2, i.e. never all-ones, and
* adding a single carry bit therefore cannot wrap.
*/
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; tt = hi+((c0<lo)?1:0); \
c1 += tt; c2 += (c1<tt)?1:0; \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,ta); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# else
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# endif
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[4], b[0], c2, c3, c1);
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
mul_add_c(a[0], b[4], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[0], b[5], c3, c1, c2);
mul_add_c(a[1], b[4], c3, c1, c2);
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
mul_add_c(a[4], b[1], c3, c1, c2);
mul_add_c(a[5], b[0], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[6], b[0], c1, c2, c3);
mul_add_c(a[5], b[1], c1, c2, c3);
mul_add_c(a[4], b[2], c1, c2, c3);
mul_add_c(a[3], b[3], c1, c2, c3);
mul_add_c(a[2], b[4], c1, c2, c3);
mul_add_c(a[1], b[5], c1, c2, c3);
mul_add_c(a[0], b[6], c1, c2, c3);
r[6] = c1;
c1 = 0;
mul_add_c(a[0], b[7], c2, c3, c1);
mul_add_c(a[1], b[6], c2, c3, c1);
mul_add_c(a[2], b[5], c2, c3, c1);
mul_add_c(a[3], b[4], c2, c3, c1);
mul_add_c(a[4], b[3], c2, c3, c1);
mul_add_c(a[5], b[2], c2, c3, c1);
mul_add_c(a[6], b[1], c2, c3, c1);
mul_add_c(a[7], b[0], c2, c3, c1);
r[7] = c2;
c2 = 0;
mul_add_c(a[7], b[1], c3, c1, c2);
mul_add_c(a[6], b[2], c3, c1, c2);
mul_add_c(a[5], b[3], c3, c1, c2);
mul_add_c(a[4], b[4], c3, c1, c2);
mul_add_c(a[3], b[5], c3, c1, c2);
mul_add_c(a[2], b[6], c3, c1, c2);
mul_add_c(a[1], b[7], c3, c1, c2);
r[8] = c3;
c3 = 0;
mul_add_c(a[2], b[7], c1, c2, c3);
mul_add_c(a[3], b[6], c1, c2, c3);
mul_add_c(a[4], b[5], c1, c2, c3);
mul_add_c(a[5], b[4], c1, c2, c3);
mul_add_c(a[6], b[3], c1, c2, c3);
mul_add_c(a[7], b[2], c1, c2, c3);
r[9] = c1;
c1 = 0;
mul_add_c(a[7], b[3], c2, c3, c1);
mul_add_c(a[6], b[4], c2, c3, c1);
mul_add_c(a[5], b[5], c2, c3, c1);
mul_add_c(a[4], b[6], c2, c3, c1);
mul_add_c(a[3], b[7], c2, c3, c1);
r[10] = c2;
c2 = 0;
mul_add_c(a[4], b[7], c3, c1, c2);
mul_add_c(a[5], b[6], c3, c1, c2);
mul_add_c(a[6], b[5], c3, c1, c2);
mul_add_c(a[7], b[4], c3, c1, c2);
r[11] = c3;
c3 = 0;
mul_add_c(a[7], b[5], c1, c2, c3);
mul_add_c(a[6], b[6], c1, c2, c3);
mul_add_c(a[5], b[7], c1, c2, c3);
r[12] = c1;
c1 = 0;
mul_add_c(a[6], b[7], c2, c3, c1);
mul_add_c(a[7], b[6], c2, c3, c1);
r[13] = c2;
c2 = 0;
mul_add_c(a[7], b[7], c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[3], b[3], c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
sqr_add_c2(a, 4, 0, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 5, 0, c3, c1, c2);
sqr_add_c2(a, 4, 1, c3, c1, c2);
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
sqr_add_c2(a, 4, 2, c1, c2, c3);
sqr_add_c2(a, 5, 1, c1, c2, c3);
sqr_add_c2(a, 6, 0, c1, c2, c3);
r[6] = c1;
c1 = 0;
sqr_add_c2(a, 7, 0, c2, c3, c1);
sqr_add_c2(a, 6, 1, c2, c3, c1);
sqr_add_c2(a, 5, 2, c2, c3, c1);
sqr_add_c2(a, 4, 3, c2, c3, c1);
r[7] = c2;
c2 = 0;
sqr_add_c(a, 4, c3, c1, c2);
sqr_add_c2(a, 5, 3, c3, c1, c2);
sqr_add_c2(a, 6, 2, c3, c1, c2);
sqr_add_c2(a, 7, 1, c3, c1, c2);
r[8] = c3;
c3 = 0;
sqr_add_c2(a, 7, 2, c1, c2, c3);
sqr_add_c2(a, 6, 3, c1, c2, c3);
sqr_add_c2(a, 5, 4, c1, c2, c3);
r[9] = c1;
c1 = 0;
sqr_add_c(a, 5, c2, c3, c1);
sqr_add_c2(a, 6, 4, c2, c3, c1);
sqr_add_c2(a, 7, 3, c2, c3, c1);
r[10] = c2;
c2 = 0;
sqr_add_c2(a, 7, 4, c3, c1, c2);
sqr_add_c2(a, 6, 5, c3, c1, c2);
r[11] = c3;
c3 = 0;
sqr_add_c(a, 6, c1, c2, c3);
sqr_add_c2(a, 7, 5, c1, c2, c3);
r[12] = c1;
c1 = 0;
sqr_add_c2(a, 7, 6, c2, c3, c1);
r[13] = c2;
c2 = 0;
sqr_add_c(a, 7, c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
#endif

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and a PCLMULQDQ one suitable for Westmere and
# later. The improvement varies from one benchmark and µ-arch to
# another. The vanilla code path is at most 20% faster than
# compiler-generated code [not very impressive], while the PCLMULQDQ
# path is a whole 85%-160% better on 163- and 571-bit ECDH benchmarks
# on Intel CPUs. Keep in mind that these coefficients are not the ones
# for bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
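#
# bn_GF2m_mul_2x2(r, a1, a0, b1, b0) computes the 256-bit carry-less
# (GF(2)[x]) product of the 128-bit polynomials a1:a0 and b1:b0 into
# r[0..3]. The vanilla path splits it into three 64x64 halves,
# a1*b1, a0*b0 and (a0+a1)*(b0+b1) (Karatsuba over GF(2)), each done
# by _mul_1x1, which uses a 16-entry 4-bit-window table of multiples
# of the a operand.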
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
sub \$128+8,%rsp
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # broadcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a2^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
ret
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
mov OPENSSL_ia32cap_P(%rip),%rax
bt \$33,%rax
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
mov %r13,8*11(%rsp)
mov %r12,8*12(%rsp)
mov %rbp,8*13(%rsp)
mov %rbx,8*14(%rsp)
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
mov 8*11(%rsp),%r13
mov 8*12(%rsp),%r12
mov 8*13(%rsp),%rbp
mov 8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
ret
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
.Lin_prologue:
lea 8*17(%rax),%rax
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large