Initial Commit

This commit is contained in:
root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

View File

@@ -0,0 +1 @@
C2.pl works

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,322 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for Alpha.
# On 21264 performance is 33% better than code generated by vendor
# compiler, and 75% better than GCC [3.4], and in absolute terms is
# 8.7 cycles per processed byte. Implementation features vectorized
# byte swap, but not Xupdate.
@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
"\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
$ctx="a0"; # $16
$inp="a1";
$num="a2";
$A="a3";
$B="a4"; # 20
$C="a5";
$D="t8";
$E="t9"; @V=($A,$B,$C,$D,$E);
$t0="t10"; # 24
$t1="t11";
$t2="ra";
$t3="t12";
$K="AT"; # 28
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
ldq_u @X[0],0+0($inp)
ldq_u @X[1],0+7($inp)
___
$code.=<<___ if (!($i&1) && $i<14);
ldq_u @X[$i+2],($i+2)*4+0($inp)
ldq_u @X[$i+3],($i+2)*4+7($inp)
___
$code.=<<___ if (!($i&1) && $i<15);
extql @X[$i],$inp,@X[$i]
extqh @X[$i+1],$inp,@X[$i+1]
or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
srl @X[$i],24,$t0 # vectorized byte swap
srl @X[$i],8,$t2
sll @X[$i],8,$t3
sll @X[$i],24,@X[$i]
zapnot $t0,0x11,$t0
zapnot $t2,0x22,$t2
zapnot @X[$i],0x88,@X[$i]
or $t0,$t2,$t0
zapnot $t3,0x44,$t3
sll $a,5,$t1
or @X[$i],$t0,@X[$i]
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
or @X[$i],$t3,@X[$i]
srl $a,27,$t0
bic $d,$b,$t3
sll $b,30,$b
extll @X[$i],4,@X[$i+1] # extract upper half
or $t2,$t3,$t2
addl @X[$i],$e,$e
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if (($i&1) && $i<15);
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
srl $a,27,$t0
addl @X[$i%16],$e,$e
bic $d,$b,$t3
sll $b,30,$b
or $t2,$t3,$t2
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if ($i>=15); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
zapnot $a,0xf,$a
addl @X[$i%16],$e,$e
bic $d,$b,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
addl $t1,$e,$e
or $t2,$t3,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t0,$e,$e
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $b,32,$t3
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
zapnot @X[$i%16],0xf,@X[$i%16]
or $t1,@X[$j%16],@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $a,27,$t0
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t0,$e,$e
or $t1,@X[$j%16],@X[$j%16]
___
$code.=<<___ if ($i<77);
zapnot @X[$i%16],0xf,@X[$i%16]
___
$code.=<<___ if ($i==79); # with context fetch
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
ldl @X[0],0($ctx)
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
ldl @X[1],4($ctx)
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
ldl @X[2],8($ctx)
srl $a,27,$t0
addl $t2,$e,$e
ldl @X[3],12($ctx)
or $t3,$b,$b
addl $t0,$e,$e
ldl @X[4],16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___; # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
and $b,$c,$t2
and $b,$d,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t1,$e,$e
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t0,$e,$e
or $t2,$t3,$t2
and $c,$d,$t3
or $t2,$t3,$t2
srl $b,32,$t3
addl @X[$i%16],$e,$e
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t2,$e,$e
or $t1,@X[$j%16],@X[$j%16]
zapnot @X[$i%16],0xf,@X[$i%16]
___
}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl sha1_block_data_order
.align 5
.ent sha1_block_data_order
sha1_block_data_order:
lda sp,-64(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
stq s2,24(sp)
stq s3,32(sp)
stq s4,40(sp)
stq s5,48(sp)
stq fp,56(sp)
.mask 0x0400fe00,-64
.frame sp,64,ra
.prologue 0
ldl $A,0($ctx)
ldl $B,4($ctx)
sll $num,6,$num
ldl $C,8($ctx)
ldl $D,12($ctx)
ldl $E,16($ctx)
addq $inp,$num,$num
.Lloop:
.set noreorder
ldah $K,23170(zero)
zapnot $B,0xf,$B
lda $K,31129($K) # K_00_19
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,28378(zero)
lda $K,-5215($K) # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-28900(zero)
lda $K,-17188($K) # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-13725(zero)
lda $K,-15914($K) # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stl $A,0($ctx)
stl $B,4($ctx)
addq $inp,64,$inp
stl $C,8($ctx)
stl $D,12($ctx)
stl $E,16($ctx)
cmpult $inp,$num,$t1
bne $t1,.Lloop
.set noreorder
ldq ra,0(sp)
ldq s0,8(sp)
ldq s1,16(sp)
ldq s2,24(sp)
ldq s3,32(sp)
ldq s4,40(sp)
ldq s5,48(sp)
ldq fp,56(sp)
lda sp,64(sp)
ret (ra)
.end sha1_block_data_order
.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

View File

@@ -0,0 +1,683 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl size in bytes comp cycles[*] measured performance
# ====================================================================
# thumb 304 3212 4420
# armv4-small 392/+29% 1958/+64% 2250/+96%
# armv4-compact 740/+89% 1552/+26% 1840/+22%
# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
# ====================================================================
# thumb = same as 'small' but in Thumb instructions[**] and
# with recurring code in two private functions;
# small = detached Xload/update, loops are folded;
# compact = detached Xload/update, 5x unroll;
# large = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*] Manually counted instructions in "grand" loop body. Measured
# performance is affected by prologue and epilogue overhead,
# i-cache availability, branch penalties, etc.
# [**] While each Thumb instruction is twice smaller, they are not as
# diverse as ARM ones: e.g., there are only two arithmetic
# instructions with 3 arguments, no [fixed] rotate, addressing
# modes are limited. As result it takes more instructions to do
# the same job in Thumb, therefore the code is never twice as
# small and always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
# issue Cortex A8 core was measured to process input block in
# ~990 cycles.
# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
teq $Xi,sp
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha1_block_data_order
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&bic ($t0,$d,$b)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t1,$c,$b)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$t1,$t0)', # F_00_19
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_00_19
'$j++; unshift(@V,pop(@V));'
)
}
sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&eor ($t0,$b,$d)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
'&eor ($t1,$t0,$c)', # F_20_39
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_20_39
'$j++; unshift(@V,pop(@V));'
)
}
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t0,$c,$d)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$c,$d)',
'&add ($e,$e,$t0)',
'&and ($t1,$t1,$b)',
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_40_59
'$j++; unshift(@V,pop(@V));'
)
}
sub Xupdate_16_31 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
&vshl_u32 (@Tx[1],@Tx[1],2);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xupdate_32_79 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@X[0],@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xuplast_80 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
&sub ($Xfer,$Xfer,64);
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[-4&7],@X[-4&7]);
foreach (@insns) { eval; } # remaining instructions
$Xi=0;
}
sub Xloop()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
foreach (@insns) { eval; }
$Xi++;
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub sp,sp,#64 @ alloca
adr $K_XX_XX,.LK_00_19
bic sp,sp,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov $Xfer,sp
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_32_79(\&body_00_19);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_20_39);
&Xuplast_80(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
moveq sp,$saved_sp
add $e,$e,$Ki
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
s/\bret\b/bx lr/o or
s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
print $_,$/;
}
close STDOUT; # enforce flush

View File

@@ -0,0 +1,338 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# hardware-assisted software(*)
# Apple A7 2.31 4.13 (+14%)
# Cortex-A53 2.24 8.03 (+97%)
# Cortex-A57 2.35 7.88 (+74%)
# Denver 2.13 3.97 (+0%)(**)
# X-Gene 8.80 (+200%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
# optimizes compiler output at run-time.
$flavour = shift;
open STDOUT,">".shift;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __ARMEB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
$code.=<<___;
#include "arm_arch.h"
.text
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA1
b.ne .Lv8_entry
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __ARMEB__
ror $Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
$code.=<<___;
.type sha1_block_armv8,%function
.align 6
sha1_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x4,.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
}}}
{ my %opcode = (
"sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
"sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
"sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Eternal question is what's wrong with compiler generated code? The
# trick is that it's possible to reduce the number of shifts required
# to perform rotations by maintaining copy of 32-bit value in upper
# bits of 64-bit register. Just follow mux2 and shrp instructions...
# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
# is >50% better than HP C and >2x better than gcc.
$code=<<___;
.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
___
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
#$human=1;
if ($human) { # useful for visual code auditing...
($A,$B,$C,$D,$E) = ("A","B","C","D","E");
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "K_00_19","K_20_39","K_40_59","K_60_79" );
@X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
"X8", "X9","X10","X11","X12","X13","X14","X15" );
}
else {
($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "r14", "r15", "loc10", "loc11" );
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
}
sub BODY_00_15 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___ if ($i==0);
{ .mmi; ld1 $X[$i]=[inp],2 // MSB
ld1 tmp2=[tmp3],2 };;
{ .mmi; ld1 tmp0=[inp],2
ld1 tmp4=[tmp3],2 // LSB
dep $X[$i]=$X[$i],tmp2,8,8 };;
___
if ($i<15) {
$code.=<<___;
{ .mmi; ld1 $Xn=[inp],2 // forward Xload
nop.m 0x0
dep tmp1=tmp0,tmp4,8,8 };;
{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
and tmp4=$c,$b
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xload
or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 };; // a>>27
{ .mmi; ld1 tmp0=[inp],2 // forward Xload
add $e=$e,tmp4 // e+=F_00_19(b,c,d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
or tmp5=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
dep $Xn=$Xn,tmp2,8,8 // forward Xload
mux2 $X[$i]=$X[$i],0x44 } //;;
___
}
else {
$code.=<<___;
{ .mii; and tmp3=$c,$b
dep tmp1=tmp0,tmp4,8,8;;
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
mux2 $X[$i]=$X[$i],0x44 };;
___
}
}
sub BODY_16_19 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; andcm tmp1=$d,$b
and tmp0=$c,$b };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
sub BODY_20_39 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
$Konst = $K_20_39 if (!defined($Konst));
my $j=$i+1;
my $Xn=@X[$j%16];
if ($i<79) {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_XX_XX
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
else {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_60_79
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
add $h1=$h1,$a };; // wrap up
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
add $h3=$h3,$c };; // wrap up
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
or tmp1=tmp1,tmp5 // ROTATE(a,5)
shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
add tmp3=1,inp // used in unaligned codepath
add $h4=$h4,$d };; // wrap up
___
}
}
sub BODY_40_59 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; and tmp1=$c,$d
xor tmp0=$c,$d };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
add tmp5=tmp5,tmp1 // a<<5+(c&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; and tmp0=tmp0,$b
xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; xor $Xn=$Xn,tmp3
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0x0 };;
___
}
sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
$code.=<<___;
.text
tmp0=r8;
tmp1=r9;
tmp2=r10;
tmp3=r11;
ctx=r32; // in0
inp=r33; // in1
// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
.global sha1_block_data_order#
.proc sha1_block_data_order#
.align 32
sha1_block_data_order:
.prologue
{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
$ADDP tmp0=4,ctx
.save ar.lc,r3
mov r3=ar.lc }
{ .mmi; $ADDP ctx=0,ctx
$ADDP inp=0,inp
mov r2=pr };;
tmp4=in2;
tmp5=loc12;
tmp6=loc13;
.body
{ .mlx; ld4 $h0=[ctx],8
movl $K_00_19=0x5a827999 }
{ .mlx; ld4 $h1=[tmp0],8
movl $K_20_39=0x6ed9eba1 };;
{ .mlx; ld4 $h2=[ctx],8
movl $K_40_59=0x8f1bbcdc }
{ .mlx; ld4 $h3=[tmp0]
movl $K_60_79=0xca62c1d6 };;
{ .mmi; ld4 $h4=[ctx],-16
add in2=-1,in2 // adjust num for ar.lc
mov ar.ec=1 };;
{ .mmi; nop.m 0
add tmp3=1,inp
mov ar.lc=in2 };; // brp.loop.imp: too far
.Ldtop:
{ .mmi; mov $A=$h0
mov $B=$h1
mux2 tmp6=$h1,0x44 }
{ .mmi; mov $C=$h2
mov $D=$h3
mov $E=$h4 };;
___
{ my $i;
my @V=($A,$B,$C,$D,$E);
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
(($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
}
$code.=<<___;
{ .mmb; add $h0=$h0,$A
add $h2=$h2,$C
br.ctop.dptk.many .Ldtop };;
.Ldend:
{ .mmi; add tmp0=4,ctx
mov ar.lc=r3 };;
{ .mmi; st4 [ctx]=$h0,8
st4 [tmp0]=$h1,8 };;
{ .mmi; st4 [ctx]=$h2,8
st4 [tmp0]=$h3 };;
{ .mib; st4 [ctx]=$h4,-16
mov pr=r2,0x1ffff
br.ret.sptk.many b0 };;
.endp sha1_block_data_order#
stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,450 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for MIPS.
# Performance improvement is 30% on unaligned input. The "secret" is
# to deploy lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...
# September 2012.
#
# Add MIPS32r2 code (>25% less instructions).
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
# offsets of the Most and Least Significant Bytes
$MSB=$big_endian?0:3;
$LSB=3&~$MSB;
@X=map("\$$_",(8..23)); # a4-a7,s0-s11
$ctx=$a0;
$inp=$a1;
$num=$a2;
$A="\$1";
$B="\$2";
$C="\$3";
$D="\$7";
$E="\$24"; @V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num; # $num is offloaded to stack
$t2="\$30"; # fp
$K="\$31"; # ra
sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[$i],@X[$i] # byte swap($i)
rotr @X[$i],@X[$i],16
#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or $t1,$t2
or @X[$i],$t1
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
xor $t0,$c,$d
rotr $t1,$a,27
lwl @X[$j],$j*4+$MSB($inp)
and $t0,$b
addu $e,$t1
lwr @X[$j],$j*4+$LSB($inp)
xor $t0,$d
addu $e,@X[$i]
rotr $b,$b,2
addu $e,$t0
#else
lwl @X[$j],$j*4+$MSB($inp)
sll $t0,$a,5 # $i
addu $e,$K
lwr @X[$j],$j*4+$LSB($inp)
srl $t1,$a,27
addu $e,$t0
xor $t0,$c,$d
addu $e,$t1
sll $t2,$b,30
and $t0,$b
srl $b,$b,2
xor $t0,$d
addu $e,@X[$i]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_15_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian && $i==15);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[$i],@X[$i] # byte swap($i)
rotr @X[$i],@X[$i],16
#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or @X[$i],$t1
or @X[$i],$t2
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
xor @X[$j%16],@X[($j+2)%16]
xor $t0,$c,$d
rotr $t1,$a,27
xor @X[$j%16],@X[($j+8)%16]
and $t0,$b
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
xor $t0,$d
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
and $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
xor $t0,$d
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
xor @X[$j%16],@X[($j+2)%16]
addu $e,$K # $i
rotr $t1,$a,27
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
xor $t0,$b
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
xor $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
addu $e,@X[$i%16]
or @X[$j%16],$t1
or $b,$t2
addu $e,$t0
#endif
___
$code.=<<___ if ($i==79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
lw @X[0],0($ctx)
addu $e,$K # $i
lw @X[1],4($ctx)
rotr $t1,$a,27
lw @X[2],8($ctx)
xor $t0,$c,$d
addu $e,$t1
lw @X[3],12($ctx)
xor $t0,$b
addu $e,@X[$i%16]
lw @X[4],16($ctx)
rotr $b,$b,2
addu $e,$t0
#else
lw @X[0],0($ctx)
sll $t0,$a,5 # $i
addu $e,$K
lw @X[1],4($ctx)
srl $t1,$a,27
addu $e,$t0
lw @X[2],8($ctx)
xor $t0,$c,$d
addu $e,$t1
lw @X[3],12($ctx)
sll $t2,$b,30
xor $t0,$b
lw @X[4],16($ctx)
srl $b,$b,2
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
and $t0,$c,$d
xor @X[$j%16],@X[($j+2)%16]
rotr $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
and $t0,$b
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
and $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
addu $e,$t0
srl $t1,@X[$j%16],31
xor $t0,$c,$d
addu @X[$j%16],@X[$j%16]
and $t0,$b
srl $b,$b,2
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
$FRAMESIZE=16; # large enough to accomodate NUBI saved registers
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
$code=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif
#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
#define _MIPS_ARCH_MIPS32R2
#endif
.text
.set noat
.set noreorder
.align 5
.globl sha1_block_data_order
.ent sha1_block_data_order
sha1_block_data_order:
.frame $sp,$FRAMESIZE*$SZREG,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL $num,6
$PTR_ADD $num,$inp
$REG_S $num,0($sp)
lw $A,0($ctx)
lw $B,4($ctx)
lw $C,8($ctx)
lw $D,12($ctx)
b .Loop
lw $E,16($ctx)
.align 4
.Loop:
.set reorder
lwl @X[0],$MSB($inp)
lui $K,0x5a82
lwr @X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
___
for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x6ed9
ori $K,0xeba1 # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x8f1b
ori $K,0xbcdc # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0xca62
ori $K,0xc1d6 # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
$PTR_ADD $inp,64
$REG_L $num,0($sp)
addu $A,$X[0]
addu $B,$X[1]
sw $A,0($ctx)
addu $C,$X[2]
addu $D,$X[3]
sw $B,4($ctx)
addu $E,$X[4]
sw $C,8($ctx)
sw $D,12($ctx)
sw $E,16($ctx)
.set noreorder
bne $inp,$num,.Loop
nop
.set noreorder
$REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end sha1_block_data_order
.rdata
.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;

View File

@@ -0,0 +1,260 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for PA-RISC.
# June 2009.
#
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
# for aligned input and >50% better for unaligned. Compared to vendor
# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
# few percent faster in 32-bit one (this for aligned input, data for
# unaligned input is not available).
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
# [+ argument transfer]
$ctx="%r26"; # arg0
$inp="%r25"; # arg1
$num="%r24"; # arg2
$t0="%r28";
$t1="%r29";
$K="%r31";
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<15);
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
addl @X[$i],$e,$e
and $c,$b,$t0
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
addl $t0,$e,$e
___
$code.=<<___ if ($i>=15); # with forward Xupdate
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
and $c,$b,$t0
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
add $t0,$e,$e
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t0,$e,$e
___
$code.=<<___ if ($i==79); # with context load
ldw 0($ctx),@X[0] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
ldw 4($ctx),@X[1]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
ldw 8($ctx),@X[2]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
ldw 12($ctx),@X[3]
addl $t0,$e,$e
ldw 16($ctx),@X[4]
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
shd $a,$a,27,$t1 ; $i
addl $K,$e,$e
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
xor $d,$c,$t0
addl @X[$i%16],$e,$e
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
and $b,$t0,$t0
addl $t1,$e,$e
shd $b,$b,2,$b
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t0,$e,$e
and $d,$c,$t1
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t1,$e,$e
___
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
sha1_block_data_order
.PROC
.CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
ldw 0($ctx),$A
ldw 4($ctx),$B
ldw 8($ctx),$C
ldw 12($ctx),$D
ldw 16($ctx),$E
extru $inp,31,2,$t0 ; t0=inp&3;
sh3addl $t0,%r0,$t0 ; t0*=8;
subi 32,$t0,$t0 ; t0=32-t0;
mtctl $t0,%cr11 ; %sar=t0;
L\$oop
ldi 3,$t0
andcm $inp,$t0,$t0 ; 64-bit neutral
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\tldw `4*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
ldw 60($t0),@X[15]
ldw 64($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align input
$code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
ldil L'0x5a827000,$K ; K_00_19
ldo 0x999($K),$K
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x6ed9e000,$K ; K_20_39
ldo 0xba1($K),$K
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x8f1bb000,$K ; K_40_59
ldo 0xcdc($K),$K
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0xca62c000,$K ; K_60_79
ldo 0x1d6($K),$K
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stw $A,0($ctx)
stw $B,4($ctx)
stw $C,8($ctx)
stw $D,12($ctx)
stw $E,16($ctx)
addib,*<> -1,$num,L\$oop
ldo 64($inp),$inp
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/,\*/,/gm if ($SIZE_T==4);
$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
print $code;
close STDOUT;

View File

@@ -0,0 +1,344 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank.
#
# (*) this means that this module is inappropriate for PPC403? Does
# anybody know if pre-POWER3 can sustain unaligned load?
# -m64 -m32
# ----------------------------------
# PPC970,gcc-4.0.0 +76% +59%
# Power6,xlc-7 +68% +33%
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
} else { die "nonsense $flavour"; }
# Define endianess based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";
$A ="r7";
$B ="r8";
$C ="r9";
$D ="r10";
$E ="r11";
$T ="r12";
@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
sub loadbe {
my ($dst, $src, $temp_reg) = @_;
$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $dst,$src
___
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $temp_reg,$src
rotlwi $dst,$temp_reg,8
rlwimi $dst,$temp_reg,24,0,7
rlwimi $dst,$temp_reg,24,16,23
___
}
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Since the last value of $f is discarded, we can use
# it as a temp reg to swap byte-order when needed.
loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
add $f,$K,$e
rotlwi $e,$a,5
add $f,$f,@X[$i]
and $t0,$c,$b
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
add $f,$f,$t0
___
$code.=<<___ if ($i>=15);
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$c,$b
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$t0
rotlwi @X[$j%16],@X[$j%16],1
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
add $f,$K,$e
xor $t0,$b,$d
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
xor $t0,$t0,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$t0
rotlwi $b,$b,30
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$e
rotlwi @X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
add $f,$K,$e
xor $t0,$b,$d
rotlwi $e,$a,5
lwz r16,0($ctx)
add $f,$f,@X[$i%16]
xor $t0,$t0,$c
lwz r17,4($ctx)
add $f,$f,$t0
rotlwi $b,$b,30
lwz r18,8($ctx)
lwz r19,12($ctx)
add $f,$f,$e
lwz r20,16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$b,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
or $t1,$b,$c
rotlwi $b,$b,30
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
and $t1,$t1,$d
or $t0,$t0,$t1
rotlwi @X[$j%16],@X[$j%16],1
add $f,$f,$t0
___
}
$code=<<___;
.machine "any"
.text
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
lwz $D,12($ctx)
lwz $E,16($ctx)
andi. r0,$inp,3
bne Lunaligned
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
$code.=<<___;
.align 4
Lsha1_block_private:
___
$code.=<<___; # load K_00_19
lis $K,0x5a82
ori $K,$K,0x7999
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_20_39
lis $K,0x6ed9
ori $K,$K,0xeba1
___
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_40_59
lis $K,0x8f1b
ori $K,$K,0xbcdc
___
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_60_79
lis $K,0xca62
ori $K,$K,0xc1d6
___
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add r16,r16,$E
add r17,r17,$T
add r18,r18,$A
add r19,r19,$B
add r20,r20,$C
stw r16,0($ctx)
mr $A,r16
stw r17,4($ctx)
mr $B,r17
stw r18,8($ctx)
mr $C,r18
stw r19,12($ctx)
mr $D,r19
stw r20,16($ctx)
mr $E,r20
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size .sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,246 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for s390x.
# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific.
$kimdfunc=1; # magic function code for kimd instruction
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2"; $prefetch="%r2";
$inp="%r3";
$len="%r4";
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9"; @V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
$code.=<<___ if ($i<16);
lg $X[0],`$i*4`($inp) ### Xload($i)
rllg $X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
rll $X[1],$X[1],1
rllg $X[0],$X[1],32
lr $X[2],$X[1] # feedback
___
$code.=<<___ if ($i<=70);
stg $X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
xr $t0,$c
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
or $t0,$c
lr $t1,$b
nr $t0,$d
nr $t1,$c
alr $e,$xi
or $t0,$t1
rll $b,$b,30
alr $e,$t0
___
}
$code.=<<___;
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
sllg %r3,$len,6
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
st${g} $ctx,`2*$SIZE_T`($sp)
stm${g} %r6,%r15,`6*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $t0,Ktable
llgf $A,0($ctx)
llgf $B,4($ctx)
llgf $C,8($ctx)
llgf $D,12($ctx)
llgf $E,16($ctx)
lg $K_00_39,0($t0)
lg $K_40_79,8($t0)
.Lloop:
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_00_39,$K_00_39,32
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
rllg $K_40_79,$K_40_79,32
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_40_79,$K_40_79,32
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
al $C,8($ctx)
al $D,12($ctx)
al $E,16($ctx)
st $A,0($ctx)
st $B,4($ctx)
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
brct${g} $len,.Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,427 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed to 8 64-bit registers and as result nothing
# is spilled on stack. In addition input data is loaded in compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# amount of active threads exceeds the number of physical cores.
# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.
$output=shift;
open STDOUT,">$output";
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
add $tmp0,$e,$e
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $xi,$e,$e
___
if ($i&1 && $i<15) {
$code.=
" srlx @X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
add $tmp1,$e,$e
___
}
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;
if ($i&1) {
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
___
} else {
$code.=<<___;
sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
srlx @X[($j+7)%8],32,$tmp1
xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
sll $a,5,$tmp0 !! $i
or $tmp1,$Xi,$Xi
add @K[$i/20],$e,$e !!
xor $Xi,@X[$j%8],@X[$j%8]
srlx @X[$j%8],31,$Xi
add @X[$j%8],@X[$j%8],@X[$j%8]
and $Xi,$rot1m,$Xi
andn @X[$j%8],$rot1m,@X[$j%8]
srl $a,27,$tmp1 !!
or $Xi,@X[$j%8],@X[$j%8]
___
}
}
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
add $xi,$e,$e
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
srl $b,2,$b
and $d,$tmp1,$tmp1
add $xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.align 32
.globl sha1_block_data_order
sha1_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_SHA1, %g0
be .Lsoftware
nop
ld [%o0 + 0x00], %f0 ! load context
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
andcc %o1, 0x7, %g0
ld [%o0 + 0x0c], %f3
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x10], %f4
.Lhw_loop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
retl
st %f4, [%o0 + 0x10]
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len
or %g0,1,$rot1m
sllx $rot1m,32,$rot1m
or $rot1m,1,$rot1m
ld [$ctx+0],$A
ld [$ctx+4],$B
ld [$ctx+8],$C
ld [$ctx+12],$D
ld [$ctx+16],$E
andn $inp,7,$tmp0
sethi %hi(0x5a827999),$K_00_19
or $K_00_19,%lo(0x5a827999),$K_00_19
sethi %hi(0x6ed9eba1),$K_20_39
or $K_20_39,%lo(0x6ed9eba1),$K_20_39
sethi %hi(0x8f1bbcdc),$K_40_59
or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
sethi %hi(0xca62c1d6),$K_60_79
or $K_60_79,%lo(0xca62c1d6),$K_60_79
.Lloop:
ldx [$tmp0+0],@X[0]
ldx [$tmp0+16],@X[2]
ldx [$tmp0+32],@X[4]
ldx [$tmp0+48],@X[6]
and $inp,7,$tmp1
ldx [$tmp0+8],@X[1]
sll $tmp1,3,$tmp1
ldx [$tmp0+24],@X[3]
subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
ldx [$tmp0+40],@X[5]
bz,pt %icc,.Laligned
ldx [$tmp0+56],@X[7]
sllx @X[0],$tmp1,@X[0]
ldx [$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{ $code.=<<___;
srlx @X[$i+1],$tmp2,$Xi
sllx @X[$i+1],$tmp1,@X[$i+1]
or $Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
srlx $tmp64,$tmp2,$tmp64
or $tmp64,@X[7],@X[7]
.Laligned:
srlx @X[0],32,$Xi
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ld [$ctx+0],@X[0]
ld [$ctx+4],@X[1]
ld [$ctx+8],@X[2]
ld [$ctx+12],@X[3]
add $inp,64,$inp
ld [$ctx+16],@X[4]
cmp $inp,$len
add $A,@X[0],$A
st $A,[$ctx+0]
add $B,@X[1],$B
st $B,[$ctx+4]
add $C,@X[2],$C
st $C,[$ctx+8]
add $D,@X[3],$D
st $D,[$ctx+12]
add $E,@X[4],$E
st $E,[$ctx+16]
bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref,$opf;
my %visopf = ( "faligndata" => 0x048,
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge;
s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,601 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
# Graphic Unit would make it possible to achieve higher instruction-
# level parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the keyword, and it means that this
# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
# not really novel, Sun had VIS-powered implementation for a while.
# Unlike Sun's implementation this one can process multiple unaligned
# input blocks, and as such works as drop-in replacement for OpenSSL
# sha1_block_data_order. Performance improvement was measured to be
# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
# UltraSPARC-III. See below for discussion...
#
# The module does not present direct interest for OpenSSL, because
# it doesn't provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*) "Pipe-lined" means that even if it takes several cycles to
# complete, next instruction using same functional unit [but not
# depending on the result of the current instruction] can start
# execution without having to wait for the unit. "Pairable"
# means that two [or more] independent instructions can be
# issued at the very same time.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output=shift;
open STDOUT,">$output";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";
$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;
$Xi="%o7";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";
$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
"%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
# covers even K_NN_MM addition...
sub Xupdate {
my ($i)=@_;
my $K=@VK[($i+16)/20];
my $j=($i+16)%16;
# [ provided that GSR.alignaddr_offset is 5, $mul contains
# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
# chosen registers... ]
$code.=<<___;
fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
![fxors %f15,%f2,%f2]
for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
fpadd32 $K,@X[$j],%f20
std %f20,[$Xfer+`4*$j`]
___
# The numbers delimited with slash are the earliest possible dispatch
# cycles for given instruction assuming 1 cycle latency for simple VIS
# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
# round. As [long as] FPU/VIS instructions are perfectly pairable with
# IALU ones, the round timing is defined by the maximum between VIS
# and IALU timings. The latter varies from round to round and averages
# out at 6.25 ticks. This means that USI&II should operate at IALU
# rate, while USIII&IV - at VIS rate. This explains why performance
# improvement varies among processors. Well, given that pure IALU
# sha1-sparcv9.pl module exhibits virtually uniform performance of
# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
# lower limits. Real-life performance was measured to be 6.6 cycles
# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
# half-round VIS timing, because there are 16 Xupdate-free rounds,
# which "push down" average theoretical timing to 8 cycles...
# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
# latency. Well, it might have, but it doesn't have dedicated
# VIS-unit. Instead, VIS instructions are executed by other
# functional units, ones used here - by IALU. This doesn't
# improve effective ILP...
}
# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13]
# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
# plenty of room to amortize for read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for latency of 2 cycles, because there are not enough IALU
# instructions to schedule for latency of 3, while scheduling for 1
# would give no gain on USI&II anyway.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
sll $b,30,$tmp2
add $tmp1,$e,$e
andn $d,$b,$tmp1
add $Xi,$e,$e
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
sll $b,30,$tmp2
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
andn $d,$b,$tmp1
add $Xi,$e,$e
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $tmp1,$e,$e
___
$code.=<<___ if ($i&1 && $i>=2);
std %f20,[$Xfer+`4*$l`] !
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1) && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
add $tmp1,$e,$e
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $Xi,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
$code.=<<___ if ($i==64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 $K,@X[$l],%f20
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
std %f20,[$Xfer+`4*$l`]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
and $d,$tmp1,$tmp1
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
or $tmp2,$b,$b
add $tmp1,$e,$e
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
and $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
or $c,$b,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
and $d,$tmp1,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
}
# If there is more data to process, then we pre-fetch the data for
# next iteration in last ten rounds...
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $m=($i%8)*2;
$j=($j+16)%16;
$code.=<<___ if ($i==70);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
ldd [$inp+64],@X[0]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
and $inp,-64,$nXfer
inc 64,$inp
and $nXfer,255,$nXfer
alignaddr %g0,$align,%g0
add $base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>=72);
faligndata @X[$m],@X[$m+2],@X[$m]
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $VK_00_19,@X[$m],%f20
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i<77);
ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77); # redundant if $inp was aligned
add $align,63,$tmp0
and $tmp0,-8,$tmp0
ldd [$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
std %f20,[$nXfer+`4*$m`]
___
}
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 64
vis_const:
.long 0x5a827999,0x5a827999 ! K_00_19
.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
.long 0xca62c1d6,0xca62c1d6 ! K_60_79
.long 0x00000100,0x00000100
.align 64
.type vis_const,#object
.size vis_const,(.-vis_const)
.globl sha1_block_data_order
sha1_block_data_order:
save %sp,-$frame,%sp
add %fp,$bias-256,$base
1: call .+8
add %o7,vis_const-1b,$tmp0
ldd [$tmp0+0],$VK_00_19
ldd [$tmp0+8],$VK_20_39
ldd [$tmp0+16],$VK_40_59
ldd [$tmp0+24],$VK_60_79
ldd [$tmp0+32],$fmul
ld [$ctx+0],$Actx
and $base,-256,$base
ld [$ctx+4],$Bctx
sub $base,$bias+$frame,%sp
ld [$ctx+8],$Cctx
and $inp,7,$align
ld [$ctx+12],$Dctx
and $inp,-8,$inp
ld [$ctx+16],$Ectx
! X[16] is maintained in FP register bank
alignaddr %g0,$align,%g0
ldd [$inp+0],@X[0]
sub $inp,-64,$Xfer
ldd [$inp+8],@X[2]
and $Xfer,-64,$Xfer
ldd [$inp+16],@X[4]
and $Xfer,255,$Xfer
ldd [$inp+24],@X[6]
add $base,$Xfer,$Xfer
ldd [$inp+32],@X[8]
ldd [$inp+40],@X[10]
ldd [$inp+48],@X[12]
brz,pt $align,.Laligned
ldd [$inp+56],@X[14]
ldd [$inp+64],@X[16]
faligndata @X[0],@X[2],@X[0]
faligndata @X[2],@X[4],@X[2]
faligndata @X[4],@X[6],@X[4]
faligndata @X[6],@X[8],@X[6]
faligndata @X[8],@X[10],@X[8]
faligndata @X[10],@X[12],@X[10]
faligndata @X[12],@X[14],@X[12]
faligndata @X[14],@X[16],@X[14]
.Laligned:
mov 5,$tmp0
dec 1,$len
alignaddr %g0,$tmp0,%g0
fpadd32 $VK_00_19,@X[0],%f16
fpadd32 $VK_00_19,@X[2],%f18
fpadd32 $VK_00_19,@X[4],%f20
fpadd32 $VK_00_19,@X[6],%f22
fpadd32 $VK_00_19,@X[8],%f24
fpadd32 $VK_00_19,@X[10],%f26
fpadd32 $VK_00_19,@X[12],%f28
fpadd32 $VK_00_19,@X[14],%f30
std %f16,[$Xfer+0]
mov $Actx,$A
std %f18,[$Xfer+8]
mov $Bctx,$B
std %f20,[$Xfer+16]
mov $Cctx,$C
std %f22,[$Xfer+24]
mov $Dctx,$D
std %f24,[$Xfer+32]
mov $Ectx,$E
std %f26,[$Xfer+40]
fxors @X[13],@X[0],@X[0]
std %f28,[$Xfer+48]
ba .Loop
std %f30,[$Xfer+56]
.align 32
.Loop:
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
tst $len
bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
nop
___
for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
mov 5,$tmp0
fxors @X[13],@X[0],@X[0]
mov $Actx,$A
mov $Bctx,$B
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer
.align 32
.Ltail:
___
for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
st $Actx,[$ctx+0]
st $Bctx,[$ctx+4]
st $Cctx,[$ctx+8]
st $Dctx,[$ctx+12]
st $Ectx,[$ctx+16]
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "fmul8ulx16" => 0x037,
"faligndata" => 0x048,
"fpadd32" => 0x052,
"fxor" => 0x06c,
"fxors" => 0x06d );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,259 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block for Thumb.
#
# January 2007.
#
# The code does not present direct interest to OpenSSL, because of low
# performance. Its purpose is to establish _size_ benchmark. Pretty
# useless one I must say, because 30% or 88 bytes larger ARMv4 code
# [avialable on demand] is almost _twice_ as fast. It should also be
# noted that in-lining of .Lcommon and .Lrotate improves performance
# by over 40%, while code increases by only 10% or 32 bytes. But once
# again, the goal was to establish _size_ benchmark, not performance.
$output=shift;
open STDOUT,">$output";
$inline=0;
#$cheat_on_binutils=1;
$t0="r0";
$t1="r1";
$t2="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8"; # "upper" registers can be used in add/sub and mov insns
$ctx="r9";
$inp="r10";
$len="r11";
$Xi="r12";
sub common {
<<___;
sub $t0,#4
ldr $t1,[$t0]
add $e,$K @ E+=K_xx_xx
lsl $t2,$a,#5
add $t2,$e
lsr $e,$a,#27
add $t2,$e @ E+=ROR(A,27)
add $t2,$t1 @ E+=X[i]
___
}
sub rotate {
<<___;
mov $e,$d @ E=D
mov $d,$c @ D=C
lsl $c,$b,#30
lsr $b,$b,#2
orr $c,$b @ C=ROR(B,2)
mov $b,$a @ B=A
add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
___
}
sub BODY_00_19 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$c
eor $t1,$d
and $t1,$b
eor $t1,$d @ F_00_19(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_20_39 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
eor $t1,$c
eor $t1,$d @ F_20_39(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_40_59 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
and $t1,$c
mov $e,$b
orr $e,$c
and $e,$d
orr $t1,$e @ F_40_59(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
$code=<<___;
.text
.code 16
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 2
sha1_block_data_order:
___
if ($cheat_on_binutils) {
$code.=<<___;
.code 32
add r3,pc,#1
bx r3 @ switch to Thumb ISA
.code 16
___
}
$code.=<<___;
push {r4-r7}
mov r3,r8
mov r4,r9
mov r5,r10
mov r6,r11
mov r7,r12
push {r3-r7,lr}
lsl r2,#6
mov $ctx,r0 @ save context
mov $inp,r1 @ save inp
mov $len,r2 @ save len
add $len,$inp @ $len to point at inp end
.Lloop:
mov $Xi,sp
mov $t2,sp
sub $t2,#16*4 @ [3]
.LXload:
ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
ldrb $b,[$t1,#1]
ldrb $c,[$t1,#2]
ldrb $d,[$t1,#3]
lsl $a,#24
lsl $b,#16
lsl $c,#8
orr $a,$b
orr $a,$c
orr $a,$d
add $t1,#4
push {$a}
cmp sp,$t2
bne .LXload @ [+14*16]
mov $inp,$t1 @ update $inp
sub $t2,#32*4
sub $t2,#32*4
mov $e,#31 @ [+4]
.LXupdate:
ldr $a,[sp,#15*4]
ldr $b,[sp,#13*4]
ldr $c,[sp,#7*4]
ldr $d,[sp,#2*4]
eor $a,$b
eor $a,$c
eor $a,$d
ror $a,$e
push {$a}
cmp sp,$t2
bne .LXupdate @ [+(11+1)*64]
ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
mov $t0,$Xi
ldr $t2,.LK_00_19
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+7+4]
.L_00_19:
___
&BODY_00_19();
$code.=<<___;
cmp $Xi,$t0
bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
ldr $t2,.LK_20_39
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_20_39_or_60_79:
___
&BODY_20_39();
$code.=<<___;
cmp $Xi,$t0
bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
cmp sp,$t0
beq .Ldone @ [+2]
ldr $t2,.LK_40_59
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_40_59:
___
&BODY_40_59();
$code.=<<___;
cmp $Xi,$t0
bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
ldr $t2,.LK_60_79
mov $Xi,sp
mov $K,$t2
b .L_20_39_or_60_79 @ [+4]
.Ldone:
mov $t0,$ctx
ldr $t1,[$t0,#0]
ldr $t2,[$t0,#4]
add $a,$t1
ldr $t1,[$t0,#8]
add $b,$t2
ldr $t2,[$t0,#12]
add $c,$t1
ldr $t1,[$t0,#16]
add $d,$t2
add $e,$t1
stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
add sp,#80*4 @ deallocate stack frame
mov $t0,$ctx @ restore ctx
mov $t1,$inp @ restore inp
cmp $t1,$len
beq .Lexit
b .Lloop @ [+6] total 3212 cycles
.Lexit:
pop {r2-r7}
mov r8,r2
mov r9,r3
mov r10,r4
mov r11,r5
mov r12,r6
mov lr,r7
pop {r4-r7}
bx lr
.align 2
___
$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
$code.=<<___;
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT; # enforce flush

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,713 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
rev $t1,$t1
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
adr r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
adr $Ktbl,.LARMv8
sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
adrl $Ktbl,K256
# endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,911 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# May 2013.
#
# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm SIMD(*) x86_64(**)
# Pentium 100 97 61 - -
# PIII 75 77 56 - -
# P4 116 95 82 34.6 30.8
# AMD K8 54 55 36 20.7 9.57
# Core2 66 57 40 15.9 9.97
# Westmere 70 - 38 12.2 9.58
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
# (***) paddq is increadibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that new code optimizes amount of writes, but at the
# cost of increased data cache "footprint" by 1/2KB.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
$K512="ebp";
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on on-demand basis...
$BxC="mm2"; # ... except for B^C
sub BODY_00_15_sse2 {
my $phase=shift;
#&movq ("mm5",$Fsse2); # load f
#&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
&pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&pand ("mm5",$E); # f&=e
&psllq ($E,23); # $E is sliding left
&movq ($A,"mm3") if ($phase<2);
&movq (&QWP(8*9,"esp"),"mm7") # save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm5","mm6"); # Ch(e,f,g)
&pxor ("mm3",$E);
&psllq ($E,23);
&pxor ("mm3","mm1");
&movq ($Asse2,$A); # modulo-scheduled save a
&paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
&pxor ("mm3",$E);
&psrlq ("mm1",23);
&paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
&psllq ($E,4);
&paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i]
&pxor ("mm3",$E); # T1=Sigma1_512(e)
&movq ($E,$Dsse2); # e = load d, e in next round
&paddq ("mm3","mm7"); # T1+=X[i]
&movq ("mm5",$A); # %mm5 is sliding right
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # d += T1
&movq ("mm6",$A); # %mm6 is sliding left
&movq ("mm7","mm5");
&psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
&psrlq ("mm5",6);
&pxor ("mm7","mm6");
&sub ("esp",8);
&psllq ("mm6",5);
&pxor ("mm7","mm5");
&pxor ($A,"mm1"); # a^b, b^c in next round
&psrlq ("mm5",5);
&pxor ("mm7","mm6");
&pand ($BxC,$A); # (b^c)&(a^b)
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&pxor ($BxC,"mm1"); # [h=]Maj(a,b,c)
&pxor ("mm6","mm7"); # Sigma0_512(a)
&movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch
&movq ("mm5",$Fsse2) if ($phase==0); # load f
if ($phase>1) {
&paddq ($BxC,"mm6"); # h+=Sigma0(a)
&add ($K512,8);
#&paddq ($BxC,"mm3"); # h+=T1
($A,$BxC) = ($BxC,$A); # rotate registers
} else {
&paddq ("mm3",$BxC); # T1+=Maj(a,b,c)
&movq ($BxC,$A);
&add ($K512,8);
&paddq ("mm3","mm6"); # T1+=Sigma0(a)
&movq ("mm6",$Gsse2) if ($phase==0); # load g
#&movq ($A,"mm3"); # h=T1
}
}
sub BODY_00_15_x86 {
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
# HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
&mov ("ecx",$Elo);
&mov ("edx",$Ehi);
&mov ("esi","ecx");
&shr ("ecx",9); # lo>>9
&mov ("edi","edx");
&shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
&shl ("edi",14); # hi<<14
&xor ("ebx","esi");
&shr ("ecx",14-9); # lo>>14
&xor ("eax","edi");
&shr ("edx",14-9); # hi>>14
&xor ("eax","ecx");
&shl ("esi",18-14); # lo<<18
&xor ("ebx","edx");
&shl ("edi",18-14); # hi<<18
&xor ("ebx","esi");
&shr ("ecx",18-14); # lo>>18
&xor ("eax","edi");
&shr ("edx",18-14); # hi>>18
&xor ("eax","ecx");
&shl ("esi",23-18); # lo<<23
&xor ("ebx","edx");
&shl ("edi",23-18); # hi<<23
&xor ("eax","esi");
&xor ("ebx","edi"); # T1 = Sigma1(e)
&mov ("ecx",$Flo);
&mov ("edx",$Fhi);
&mov ("esi",$Glo);
&mov ("edi",$Ghi);
&add ("eax",$Hlo);
&adc ("ebx",$Hhi); # T1 += h
&xor ("ecx","esi");
&xor ("edx","edi");
&and ("ecx",$Elo);
&and ("edx",$Ehi);
&add ("eax",&DWP(8*(9+15)+0,"esp"));
&adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
&xor ("ecx","esi");
&xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
&mov ("esi",&DWP(0,$K512));
&mov ("edi",&DWP(4,$K512)); # K[i]
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Ch(e,f,g)
&mov ("ecx",$Dlo);
&mov ("edx",$Dhi);
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += K[i]
&mov ($Tlo,"eax");
&mov ($Thi,"ebx"); # put T1 away
&add ("eax","ecx");
&adc ("ebx","edx"); # d += T1
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
# HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ($Dlo,"eax");
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
&shr ("ecx",2); # lo>>2
&mov ("edi","edx");
&shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
&shl ("edi",4); # hi<<4
&xor ("ebx","esi");
&shr ("ecx",7-2); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-2); # hi>>7
&xor ("ebx","ecx");
&shl ("esi",25-4); # lo<<25
&xor ("eax","edx");
&shl ("edi",25-4); # hi<<25
&xor ("eax","esi");
&shr ("ecx",28-7); # lo>>28
&xor ("ebx","edi");
&shr ("edx",28-7); # hi>>28
&xor ("eax","ecx");
&shl ("esi",30-25); # lo<<30
&xor ("ebx","edx");
&shl ("edi",30-25); # hi<<30
&xor ("eax","esi");
&xor ("ebx","edi"); # Sigma0(a)
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ("esi",$Blo);
&mov ("edi",$Bhi);
&add ("eax",$Tlo);
&adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
&or ("ecx","esi");
&or ("edx","edi");
&and ("ecx",$Clo);
&and ("edx",$Chi);
&and ("esi",$Alo);
&and ("edi",$Ahi);
&or ("ecx","esi");
&or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Maj(a,b,c)
&mov ($Tlo,"eax");
&mov ($Thi,"ebx");
&mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
&sub ("esp",8);
&lea ($K512,&DWP(8,$K512)); # K++
}
&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K512);
&lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",7);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&mov ("ecx",&DWP(0,"edx"));
&test ("ecx",1<<26);
&jz (&label("loop_x86"));
&mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
&and ("ecx",1<<24); # XMM registers availability
&movq ("mm1",&QWP(8,"esi"));
&and ("edx",1<<9); # SSSE3 bit
&movq ($BxC,&QWP(16,"esi"));
&or ("ecx","edx");
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
&cmp ("ecx",1<<24|1<<9);
&je (&label("SSSE3"));
&sub ("esp",8*10);
&jmp (&label("loop_sse2"));
&set_label("loop_sse2",16);
#&movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
&movq ($Csse2,$BxC);
&movq ($Dsse2,"mm3");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&movq ("mm3",$A); # magic
&mov ("eax",&DWP(0,"edi"));
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&mov ("edx",15); # counter
&bswap ("eax");
&bswap ("ebx");
&jmp (&label("00_14_sse2"));
&set_label("00_14_sse2",16);
&movd ("mm1","eax");
&mov ("eax",&DWP(0,"edi"));
&movd ("mm7","ebx");
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2();
&dec ("edx");
&jnz (&label("00_14_sse2"));
&movd ("mm1","eax");
&movd ("mm7","ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2(1);
&pxor ($A,$A); # A is in %mm3
&mov ("edx",32); # counter
&jmp (&label("16_79_sse2"));
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
&movq ("mm6","mm5");
&psrlq ("mm5",6);
&psllq ("mm1",56);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm3","mm7");
&psrlq ("mm7",7-1);
&pxor ("mm3","mm1");
&psllq ("mm1",63-56);
&pxor ("mm3","mm7");
&psrlq ("mm7",8-7);
&pxor ("mm3","mm1");
&movq ("mm1","mm5");
&psrlq ("mm5",19-6);
&pxor ("mm7","mm3"); # sigma0
&psllq ("mm6",3);
&pxor ("mm1","mm5");
&paddq ("mm7",&QWP(8*(9+16),"esp"));
&pxor ("mm1","mm6");
&psrlq ("mm5",61-19);
&paddq ("mm7",&QWP(8*(9+16-9),"esp"));
&pxor ("mm1","mm5");
&psllq ("mm6",45-3);
&movq ("mm5",$Fsse2); # load f
&pxor ("mm1","mm6"); # sigma1
&movq ("mm6",$Gsse2); # load g
&paddq ("mm7","mm1"); # X[i]
#&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15
&BODY_00_15_sse2(2);
}
&dec ("edx");
&jnz (&label("16_79_sse2"));
#&movq ($A,$Asse2);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm1",$Bsse2);
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
&movq ("mm5",$Fsse2);
&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&mov ("eax",8*80);
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&lea ("esp",&DWP(0,"esp","eax")); # destroy frame
&sub ($K512,"eax"); # rewind K
&cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
&jb (&label("loop_sse2"));
&mov ("esp",&DWP(8*10+12,"esp")); # restore sp
&emms ();
&function_end_A();
&set_label("SSSE3",32);
{ my ($cnt,$frame)=("ecx","edx");
my @X=map("xmm$_",(0..7));
my $j;
my $i=0;
&lea ($frame,&DWP(-64,"esp"));
&sub ("esp",256);
# fixed stack frame layout
#
# +0 A B C D E F G H # backing store
# +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area
# +192 # XMM off-load ring buffer
# +256 # saved parameters
&movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask
&movdqu (@X[0],&QWP(0,"edi"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) {
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&jmp (&label("loop_ssse3"));
&nop ();
&set_label("loop_ssse3",32);
&movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1]
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3]
&lea ($K512,&DWP(16*8,$K512));
#&movq ($Asse2,$A); # off-load A-H
&movq ($Bsse2,"mm1");
&mov ("ebx","edi");
&movq ($Csse2,$BxC);
&lea ("edi",&DWP(128,"edi")); # advance input
&movq ($Dsse2,"mm3");
&cmp ("edi","eax");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&cmovb ("ebx","edi");
&movq ($Gsse2,"mm6");
&mov ("ecx",4); # loop counter
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&pxor ("mm3","mm3"); # magic
&jmp (&label("00_47_ssse3"));
sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
(
'&movq ("mm1",$E)', # %mm1 is sliding right
'&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
'&pxor ("mm5","mm6")', # f^=g
'&psrlq ("mm1",14)',
'&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e
'&pand ("mm5",$E)', # f&=e
'&psllq ($E,23)', # $E is sliding left
'&paddq ($A,"mm3")', # [h+=Maj(a,b,c)]
'&movq ("mm3","mm1")', # %mm3 is T1
'&psrlq("mm1",4)',
'&pxor ("mm5","mm6")', # Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psllq($E,23)',
'&pxor ("mm3","mm1")',
'&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a
'&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psrlq("mm1",23)',
'&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h
'&pxor ("mm3","mm1")',
'&psllq($E,4)',
'&pxor ("mm3",$E)', # T1=Sigma1_512(e)
'&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round
'&paddq ("mm3","mm7")', # T1+=X[i]
'&movq ("mm5",$A)', # %mm5 is sliding right
'&psrlq("mm5",28)',
'&paddq ($E,"mm3")', # d += T1
'&movq ("mm6",$A)', # %mm6 is sliding left
'&movq ("mm7","mm5")',
'&psllq("mm6",25)',
'&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b
'&psrlq("mm5",6)',
'&pxor ("mm7","mm6")',
'&psllq("mm6",5)',
'&pxor ("mm7","mm5")',
'&pxor ($A,"mm1")', # a^b, b^c in next round
'&psrlq("mm5",5)',
'&pxor ("mm7","mm6")',
'&pand ($BxC,$A)', # (b^c)&(a^b)
'&psllq("mm6",6)',
'&pxor ("mm7","mm5")',
'&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c)
'&pxor ("mm6","mm7")', # Sigma0_512(a)
'&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f
'&paddq ($BxC,"mm6")', # h+=Sigma0(a)
'&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g
'($A,$BxC) = ($BxC,$A); $i--;'
);
}
&set_label("00_47_ssse3",32);
for(;$j<16;$j++) {
my ($t0,$t2,$t1)=@X[2..4];
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa ($t2,@X[5]);
&movdqa (@X[1],$t0); # restore @X[1]
&palignr ($t0,@X[0],8); # X[1..2]
&movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4]
&palignr ($t2,@X[4],8); # X[9..10]
&movdqa ($t1,$t0);
&psrlq ($t0,7);
&paddq (@X[0],$t2); # X[0..1] += X[9..10]
&movdqa ($t2,$t1);
&psrlq ($t1,1);
&psllq ($t2,64-8);
&pxor ($t0,$t1);
&psrlq ($t1,8-1);
&pxor ($t0,$t2);
&psllq ($t2,8-1);
&pxor ($t0,$t1);
&movdqa ($t1,@X[7]);
&pxor ($t0,$t2); # sigma0(X[1..2])
&movdqa ($t2,@X[7]);
&psrlq ($t1,6);
&paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2])
&movdqa ($t0,@X[7]);
&psrlq ($t2,19);
&psllq ($t0,64-61);
&pxor ($t1,$t2);
&psrlq ($t2,61-19);
&pxor ($t1,$t0);
&psllq ($t0,61-19);
&pxor ($t1,$t2);
&movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
&pxor ($t1,$t0); # sigma0(X[1..2])
&movdqa ($t0,&QWP(16*($j%8),$K512));
eval(shift(@insns));
&paddq (@X[0],$t1); # X[0..1] += sigma0(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddq ($t0,@X[0]);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
&lea ($K512,&DWP(16*8,$K512));
&dec ("ecx");
&jnz (&label("00_47_ssse3"));
&movdqa (@X[1],&QWP(0,$K512)); # byte swap mask
&lea ($K512,&DWP(-80*8,$K512)); # rewind
&movdqu (@X[0],&QWP(0,"ebx"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) { # load next or same block
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&movq ($A,$Asse2); # load A-H
&movq ("mm1",$Bsse2);
&paddq ($A,"mm3"); # from BODY_00_15
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
#&movq ("mm5",$Fsse2);
#&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&cmp ("edi","eax") # are we done yet?
&jb (&label("loop_ssse3"));
&mov ("esp",&DWP(64+12,$frame)); # restore sp
&emms ();
}
&function_end_A();
}
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",128);
&sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
&mov (&DWP(8*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&lea ("edi",&DWP(8,"esp"));
&mov ("ecx",16);
&data_word(0xA5F3F689); # rep movsd
&set_label("00_15_x86",16);
&BODY_00_15_x86();
&cmp (&LB("edx"),0x94);
&jne (&label("00_15_x86"));
&set_label("16_79_x86",16);
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
# HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
&mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",1); # lo>>1
&mov ("edi","edx");
&shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
&shl ("edi",24); # hi<<24
&xor ("ebx","esi");
&shr ("ecx",7-1); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-1); # hi>>7
&xor ("eax","ecx");
&shl ("esi",31-24); # lo<<31
&xor ("ebx","edx");
&shl ("edi",25-24); # hi<<25
&xor ("ebx","esi");
&shr ("ecx",8-7); # lo>>8
&xor ("eax","edi");
&shr ("edx",8-7); # hi>>8
&xor ("eax","ecx");
&shl ("edi",31-25); # hi<<31
&xor ("ebx","edx");
&xor ("eax","edi"); # T1 = sigma0(X[-15])
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ebx"); # put T1 away
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
# HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
&mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",6); # lo>>6
&mov ("edi","edx");
&shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
&shl ("edi",3); # hi<<3
&xor ("eax","esi");
&shr ("ecx",19-6); # lo>>19
&xor ("ebx","edi");
&shr ("edx",19-6); # hi>>19
&xor ("eax","ecx");
&shl ("esi",13-3); # lo<<13
&xor ("ebx","edx");
&shl ("edi",13-3); # hi<<13
&xor ("ebx","esi");
&shr ("ecx",29-19); # lo>>29
&xor ("eax","edi");
&shr ("edx",29-19); # hi>>29
&xor ("ebx","ecx");
&shl ("edi",26-13); # hi<<26
&xor ("eax","edx");
&xor ("eax","edi"); # sigma1(X[-2])
&mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
&add ("eax",&DWP(0,"esp"));
&adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
&mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
&mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += X[-16]
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += X[-7]
&mov (&DWP(8*(9+15)+0,"esp"),"eax");
&mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
&BODY_00_15_x86();
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_x86"));
&mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
&mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"esi"));
&mov ("ebx",&DWP($i*16+4,"esi"));
&mov ("ecx",&DWP($i*16+8,"esi"));
&mov ("edx",&DWP($i*16+12,"esi"));
&add ("eax",&DWP(8+($i*16)+0,"esp"));
&adc ("ebx",&DWP(8+($i*16)+4,"esp"));
&mov (&DWP($i*16+0,"esi"),"eax");
&mov (&DWP($i*16+4,"esi"),"ebx");
&add ("ecx",&DWP(8+($i*16)+8,"esp"));
&adc ("edx",&DWP(8+($i*16)+12,"esp"));
&mov (&DWP($i*16+8,"esi"),"ecx");
&mov (&DWP($i*16+12,"esi"),"edx");
}
&add ("esp",8*(9+16+80)); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop_x86"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64
&data_word(0xec4d3b2f,0xb5c0fbcf); # u64
&data_word(0x8189dbbc,0xe9b5dba5); # u64
&data_word(0xf348b538,0x3956c25b); # u64
&data_word(0xb605d019,0x59f111f1); # u64
&data_word(0xaf194f9b,0x923f82a4); # u64
&data_word(0xda6d8118,0xab1c5ed5); # u64
&data_word(0xa3030242,0xd807aa98); # u64
&data_word(0x45706fbe,0x12835b01); # u64
&data_word(0x4ee4b28c,0x243185be); # u64
&data_word(0xd5ffb4e2,0x550c7dc3); # u64
&data_word(0xf27b896f,0x72be5d74); # u64
&data_word(0x3b1696b1,0x80deb1fe); # u64
&data_word(0x25c71235,0x9bdc06a7); # u64
&data_word(0xcf692694,0xc19bf174); # u64
&data_word(0x9ef14ad2,0xe49b69c1); # u64
&data_word(0x384f25e3,0xefbe4786); # u64
&data_word(0x8b8cd5b5,0x0fc19dc6); # u64
&data_word(0x77ac9c65,0x240ca1cc); # u64
&data_word(0x592b0275,0x2de92c6f); # u64
&data_word(0x6ea6e483,0x4a7484aa); # u64
&data_word(0xbd41fbd4,0x5cb0a9dc); # u64
&data_word(0x831153b5,0x76f988da); # u64
&data_word(0xee66dfab,0x983e5152); # u64
&data_word(0x2db43210,0xa831c66d); # u64
&data_word(0x98fb213f,0xb00327c8); # u64
&data_word(0xbeef0ee4,0xbf597fc7); # u64
&data_word(0x3da88fc2,0xc6e00bf3); # u64
&data_word(0x930aa725,0xd5a79147); # u64
&data_word(0xe003826f,0x06ca6351); # u64
&data_word(0x0a0e6e70,0x14292967); # u64
&data_word(0x46d22ffc,0x27b70a85); # u64
&data_word(0x5c26c926,0x2e1b2138); # u64
&data_word(0x5ac42aed,0x4d2c6dfc); # u64
&data_word(0x9d95b3df,0x53380d13); # u64
&data_word(0x8baf63de,0x650a7354); # u64
&data_word(0x3c77b2a8,0x766a0abb); # u64
&data_word(0x47edaee6,0x81c2c92e); # u64
&data_word(0x1482353b,0x92722c85); # u64
&data_word(0x4cf10364,0xa2bfe8a1); # u64
&data_word(0xbc423001,0xa81a664b); # u64
&data_word(0xd0f89791,0xc24b8b70); # u64
&data_word(0x0654be30,0xc76c51a3); # u64
&data_word(0xd6ef5218,0xd192e819); # u64
&data_word(0x5565a910,0xd6990624); # u64
&data_word(0x5771202a,0xf40e3585); # u64
&data_word(0x32bbd1b8,0x106aa070); # u64
&data_word(0xb8d2d0c8,0x19a4c116); # u64
&data_word(0x5141ab53,0x1e376c08); # u64
&data_word(0xdf8eeb99,0x2748774c); # u64
&data_word(0xe19b48a8,0x34b0bcb5); # u64
&data_word(0xc5c95a63,0x391c0cb3); # u64
&data_word(0xe3418acb,0x4ed8aa4a); # u64
&data_word(0x7763e373,0x5b9cca4f); # u64
&data_word(0xd6b2b8a3,0x682e6ff3); # u64
&data_word(0x5defb2fc,0x748f82ee); # u64
&data_word(0x43172f60,0x78a5636f); # u64
&data_word(0xa1f0ab72,0x84c87814); # u64
&data_word(0x1a6439ec,0x8cc70208); # u64
&data_word(0x23631e28,0x90befffa); # u64
&data_word(0xde82bde9,0xa4506ceb); # u64
&data_word(0xb2c67915,0xbef9a3f7); # u64
&data_word(0xe372532b,0xc67178f2); # u64
&data_word(0xea26619c,0xca273ece); # u64
&data_word(0x21c0c207,0xd186b8c7); # u64
&data_word(0xcde0eb1e,0xeada7dd6); # u64
&data_word(0xee6ed178,0xf57d4f7f); # u64
&data_word(0x72176fba,0x06f067aa); # u64
&data_word(0xa2c898a6,0x0a637dc5); # u64
&data_word(0xbef90dae,0x113f9804); # u64
&data_word(0x131c471b,0x1b710b35); # u64
&data_word(0x23047d84,0x28db77f5); # u64
&data_word(0x40c72493,0x32caab7b); # u64
&data_word(0x15c9bebc,0x3c9ebe0a); # u64
&data_word(0x9c100d4c,0x431d67c4); # u64
&data_word(0xcb3e42b6,0x4cc5d4be); # u64
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
&data_word(0x04050607,0x00010203); # byte swap
&data_word(0x0c0d0e0f,0x08090a0b); # mask
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,609 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Coxtex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited by 1:-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# even find yourself striving, as I did here, for achieving IPC
# adequate to one delivered by Cortex A8 [for reference, it's
# 0.5 for ILP of 1, and 1 for higher ILPs].
#
# (*) ILP, instruction-level parallelism, how many instructions
# *can* execute at the same time. IPC, instructions per cycle,
# indicates how many instructions actually execute.
# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
.code 32
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha512_block_data_order
.skip 32-4
#else
.skip 32
#endif
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
ret @ bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT; # enforce flush

View File

@@ -0,0 +1,422 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significanty faster
# and the gap is only 40-90%.
$flavour=shift;
$output=shift;
open STDOUT,">$output";
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="sha${BITS}_block_data_order";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __ARMEB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
$code.=<<___;
#include "arm_arch.h"
.text
.globl $func
.type $func,%function
.align 6
$func:
___
$code.=<<___ if ($SZ==4);
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
___
$code.=<<___;
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adr $Ktbl,K$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
ret
.size $func,.-$func
.align 6
.type K$BITS,%object
K$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size K$BITS,.-K$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,K256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
___
}
$code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,685 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by HP-UX
# compiler (yes, HP-UX is generating slower code, because unlike gcc,
# it failed to deploy "shift right pair," 'shrp' instruction, which
# substitutes for 64-bit rotate).
#
# 924 cycles long sha256_block outperforms gcc by over factor of 2(!)
# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
# are spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by 'shrp' instruction and for this
# reason lower 32 bits are deposited to upper half of 64-bit register
# prior 'shrp' issue. And in order to minimize the amount of such
# operations, X[16] values are *maintained* with copies of lower
# halves in upper halves, which is why you'll spot such instructions
# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
# 32-bit unsigned right shift," 'pshr4.u' instructions here.
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by multi-media ALU and *are* pairable
# with rotates [and alike]. On the backside MMALU is rather slow: it
# takes 2 extra cycles before the result of integer operation is
# available *to* MMALU and 2(*) extra cycles before the result of MM
# operation is available "back" *to* integer ALU, not to mention that
# MMALU itself has 2 cycles latency. However! I explicitly scheduled
# these MM instructions to avoid MM stalls, so that all these extra
# latencies get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
# for 2 in order to provide for best *overall* performance,
# because on Itanium 1 stall on MM result is accompanied by
# pipeline flush, which takes 6 cycles:-(
#
# June 2012
#
# Improve performance by 15-20%. Note about "rules of engagement"
# above. Contemporary cores are equipped with additional shifter,
# so that they should perform even better than below, presumably
# by ~10%.
#
######################################################################
# Current performance in cycles per processed byte for Itanium 2
# pre-9000 series [little-endian] system:
#
# SHA1(*) 5.7
# SHA256 12.6
# SHA512 6.7
#
# (*) SHA1 result is presented purely for reference purposes.
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.
$output=shift;
if ($output =~ /512.*\.[s|asm]/) {
$SZ=8;
$BITS=8*$SZ;
$LDW="ld8";
$STW="st8";
$ADD="add";
$SHRU="shr.u";
$TABLE="K512";
$func="sha512_block_data_order";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
} elsif ($output =~ /256.*\.[s|asm]/) {
$SZ=4;
$BITS=8*$SZ;
$LDW="ld4";
$STW="st4";
$ADD="padd4";
$SHRU="pshr4.u";
$TABLE="K256";
$func="sha256_block_data_order";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
} else { die "nonsense $output"; }
open STDOUT,">$output" || die "can't open $output: $!";
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
$code=<<___;
.ident \"$output, version 2.0\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\"
.explicit
.text
pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
A_=r16; B_=r17; C_=r18; D_=r19;
E_=r20; F_=r21; G_=r22; H_=r23;
T1=r24; T2=r25;
s0=r26; s1=r27; t0=r28; t1=r29;
Ktbl=r30;
ctx=r31; // 1st arg
input=r56; // 2nd arg
num=r57; // 3rd arg
sgm0=r58; sgm1=r59; // small constants
// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global $func#
.proc $func#
.align 32
.skip 16
$func:
.prologue
.save ar.pfs,pfssave
{ .mmi; alloc pfssave=ar.pfs,3,25,0,24
$ADDP ctx=0,r32 // 1st arg
.save ar.lc,lcsave
mov lcsave=ar.lc }
{ .mmi; $ADDP input=0,r33 // 2nd arg
mov num=r34 // 3rd arg
.save pr,prsave
mov prsave=pr };;
.body
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx };;
// load A-H
.Lpic_point:
{ .mmi; $LDW A_=[r8],4*$SZ
$LDW B_=[r9],4*$SZ
mov Ktbl=ip }
{ .mmi; $LDW C_=[r10],4*$SZ
$LDW D_=[r11],4*$SZ
mov sgm0=$sigma0[2] };;
{ .mmi; $LDW E_=[r8]
$LDW F_=[r9]
add Ktbl=($TABLE#-.Lpic_point),Ktbl }
{ .mmi; $LDW G_=[r10]
$LDW H_=[r11]
cmp.ne p0,p16=0,r0 };;
___
$code.=<<___ if ($BITS==64);
{ .mii; and r8=7,input
and input=~7,input;;
cmp.eq p9,p0=1,r8 }
{ .mmi; cmp.eq p10,p0=2,r8
cmp.eq p11,p0=3,r8
cmp.eq p12,p0=4,r8 }
{ .mmi; cmp.eq p13,p0=5,r8
cmp.eq p14,p0=6,r8
cmp.eq p15,p0=7,r8 };;
___
$code.=<<___;
.L_outer:
.rotr R[8],X[16]
A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7]
{ .mmi; ld1 X[15]=[input],$SZ // eliminated in sha512
mov A=A_
mov ar.lc=14 }
{ .mmi; mov B=B_
mov C=C_
mov D=D_ }
{ .mmi; mov E=E_
mov F=F_
mov ar.ec=2 };;
{ .mmi; mov G=G_
mov H=H_
mov sgm1=$sigma1[2] }
{ .mib; mov r8=0
add r9=1-$SZ,input
brp.loop.imp .L_first16,.L_first16_end-16 };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
// in sha512 case I load whole X[16] at once and take care of alignment...
{ .mmi; add r8=1*$SZ,input
add r9=2*$SZ,input
add r10=3*$SZ,input };;
{ .mmb; $LDW X[15]=[input],4*$SZ
$LDW X[14]=[r8],4*$SZ
(p9) br.cond.dpnt.many .L1byte };;
{ .mmb; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
(p10) br.cond.dpnt.many .L2byte };;
{ .mmb; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
(p11) br.cond.dpnt.many .L3byte };;
{ .mmb; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
(p12) br.cond.dpnt.many .L4byte };;
{ .mmb; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
(p13) br.cond.dpnt.many .L5byte };;
{ .mmb; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
(p14) br.cond.dpnt.many .L6byte };;
{ .mmb; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
(p15) br.cond.dpnt.many .L7byte };;
{ .mmb; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L1byte:
{ .mmi; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
shrp X[15]=X[15],X[14],56 };;
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[14]=X[14],X[13],56 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[13]=X[13],X[12],56 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[12]=X[12],X[11],56 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[11]=X[11],X[10],56 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[10]=X[10],X[ 9],56 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[ 9]=X[ 9],X[ 8],56 };;
{ .mii; $LDW T1=[input]
shrp X[ 8]=X[ 8],X[ 7],56
shrp X[ 7]=X[ 7],X[ 6],56 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
shrp X[ 5]=X[ 5],X[ 4],56 };;
{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
shrp X[ 3]=X[ 3],X[ 2],56 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
shrp X[ 1]=X[ 1],X[ 0],56 }
{ .mib; shrp X[ 0]=X[ 0],T1,56 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L2byte:
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[15]=X[15],X[14],48 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[14]=X[14],X[13],48 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[13]=X[13],X[12],48 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[12]=X[12],X[11],48 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[11]=X[11],X[10],48 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[10]=X[10],X[ 9],48 };;
{ .mii; $LDW T1=[input]
shrp X[ 9]=X[ 9],X[ 8],48
shrp X[ 8]=X[ 8],X[ 7],48 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
shrp X[ 6]=X[ 6],X[ 5],48 };;
{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
shrp X[ 4]=X[ 4],X[ 3],48 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
shrp X[ 2]=X[ 2],X[ 1],48 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
shrp X[ 0]=X[ 0],T1,48 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L3byte:
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[15]=X[15],X[14],40 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[14]=X[14],X[13],40 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[13]=X[13],X[12],40 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[12]=X[12],X[11],40 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[11]=X[11],X[10],40 };;
{ .mii; $LDW T1=[input]
shrp X[10]=X[10],X[ 9],40
shrp X[ 9]=X[ 9],X[ 8],40 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
shrp X[ 7]=X[ 7],X[ 6],40 };;
{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
shrp X[ 5]=X[ 5],X[ 4],40 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
shrp X[ 3]=X[ 3],X[ 2],40 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
shrp X[ 1]=X[ 1],X[ 0],40 }
{ .mib; shrp X[ 0]=X[ 0],T1,40 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L4byte:
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[15]=X[15],X[14],32 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[14]=X[14],X[13],32 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[13]=X[13],X[12],32 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[12]=X[12],X[11],32 };;
{ .mii; $LDW T1=[input]
shrp X[11]=X[11],X[10],32
shrp X[10]=X[10],X[ 9],32 }
{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
shrp X[ 8]=X[ 8],X[ 7],32 };;
{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
shrp X[ 6]=X[ 6],X[ 5],32 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
shrp X[ 4]=X[ 4],X[ 3],32 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
shrp X[ 2]=X[ 2],X[ 1],32 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
shrp X[ 0]=X[ 0],T1,32 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L5byte:
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[15]=X[15],X[14],24 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[14]=X[14],X[13],24 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[13]=X[13],X[12],24 };;
{ .mii; $LDW T1=[input]
shrp X[12]=X[12],X[11],24
shrp X[11]=X[11],X[10],24 }
{ .mii; shrp X[10]=X[10],X[ 9],24
shrp X[ 9]=X[ 9],X[ 8],24 };;
{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
shrp X[ 7]=X[ 7],X[ 6],24 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
shrp X[ 5]=X[ 5],X[ 4],24 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
shrp X[ 3]=X[ 3],X[ 2],24 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
shrp X[ 1]=X[ 1],X[ 0],24 }
{ .mib; shrp X[ 0]=X[ 0],T1,24 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L6byte:
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[15]=X[15],X[14],16 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[14]=X[14],X[13],16 };;
{ .mii; $LDW T1=[input]
shrp X[13]=X[13],X[12],16
shrp X[12]=X[12],X[11],16 }
{ .mii; shrp X[11]=X[11],X[10],16
shrp X[10]=X[10],X[ 9],16 };;
{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
shrp X[ 8]=X[ 8],X[ 7],16 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
shrp X[ 6]=X[ 6],X[ 5],16 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
shrp X[ 4]=X[ 4],X[ 3],16 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
shrp X[ 2]=X[ 2],X[ 1],16 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
shrp X[ 0]=X[ 0],T1,16 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L7byte:
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[15]=X[15],X[14],8 };;
{ .mii; $LDW T1=[input]
shrp X[14]=X[14],X[13],8
shrp X[13]=X[13],X[12],8 }
{ .mii; shrp X[12]=X[12],X[11],8
shrp X[11]=X[11],X[10],8 };;
{ .mii; shrp X[10]=X[10],X[ 9],8
shrp X[ 9]=X[ 9],X[ 8],8 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
shrp X[ 7]=X[ 7],X[ 6],8 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
shrp X[ 5]=X[ 5],X[ 4],8 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
shrp X[ 3]=X[ 3],X[ 2],8 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
shrp X[ 1]=X[ 1],X[ 0],8 }
{ .mib; shrp X[ 0]=X[ 0],T1,8 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev };; // eliminated on big-endian
.align 32
.L_first16:
{ .mmi; $LDW K=[Ktbl],$SZ
add A=A,r8 // H+=Sigma(0) from the past
_rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; and T1=F,E
andcm r8=G,E
(p16) mux1 X[14]=X[14],\@rev };; // eliminated on big-endian
{ .mmi; and T2=A,B
and r9=A,C
_rotr r11=$t1,$Sigma1[1] } // ROTR(e,41)
{ .mmi; xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
and r8=B,C };;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align 32
.L_first16:
{ .mmi; add A=A,r8 // H+=Sigma(0) from the past
add r10=2-$SZ,input
add r11=3-$SZ,input };;
{ .mmi; ld1 r9=[r9]
ld1 r10=[r10]
dep.z $t1=E,32,32 }
{ .mmi; ld1 r11=[r11]
$LDW K=[Ktbl],$SZ
zxt4 E=E };;
{ .mii; or $t1=$t1,E
dep X[15]=X[15],r9,8,8
mux2 $t0=A,0x44 };; // copy lower half to upper
{ .mmi; and T1=F,E
andcm r8=G,E
dep r11=r10,r11,8,8 };;
{ .mmi; and T2=A,B
and r9=A,C
dep X[15]=X[15],r11,16,16 };;
{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
_rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; and r8=B,C
_rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18)
___
$code.=<<___;
{ .mmi; add T1=T1,H // T1=Ch(e,f,g)+h
xor r10=r10,r11
_rotr r11=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mmi; xor T2=T2,r9
add K=K,X[15] };;
{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
xor T2=T2,r8 // T2=((a & b) ^ (a & c) ^ (b & c))
_rotr r8=$t0,$Sigma0[0] } // ROTR(a,28)
{ .mmi; xor r11=r11,r10 // Sigma1(e)
_rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
{ .mmi; add T1=T1,r11 // T+=Sigma1(e)
xor r8=r8,r9
_rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
{ .mmi; xor r8=r8,r9 // Sigma0(a)
add D=D,T1
mux2 H=X[15],0x44 } // mov H=X[15] in sha512
{ .mib; (p16) add r9=1-$SZ,input // not used in sha512
add X[15]=T1,T2 // H=T1+Maj(a,b,c)
br.ctop.sptk .L_first16 };;
.L_first16_end:
{ .mib; mov ar.lc=$rounds-17
brp.loop.imp .L_rest,.L_rest_end-16 }
{ .mib; mov ar.ec=1
br.many .L_rest };;
.align 32
.L_rest:
{ .mmi; $LDW K=[Ktbl],$SZ
add A=A,r8 // H+=Sigma0(a) from the past
_rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
{ .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
$SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
{ .mib; and T1=F,E
_rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
{ .mib; andcm r10=G,E
$SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon $SHRU output usage
{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
xor r9=r8,r9
_rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19)
{ .mmi; and T2=A,B
and r8=A,C
_rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61)
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
dep.z $t1=E,32,32 }
{ .mib; xor r10=r11,r10
zxt4 E=E };;
{ .mii; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
shrp r9=E,$t1,32+$Sigma1[0] // ROTR(e,14)
mux2 $t0=A,0x44 };; // copy lower half to upper
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon mux2 output usage
{ .mmi; xor T2=T2,r8
shrp r8=E,$t1,32+$Sigma1[1]} // ROTR(e,18)
{ .mmi; and r10=B,C
add T1=T1,H // T1=Ch(e,f,g)+h
or $t1=$t1,E };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
_rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mib; xor r10=r11,r10
xor T2=T2,r8 };;
{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
_rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
{ .mib; and r10=B,C
add T1=T1,H };; // T1+=H
___
$code.=<<___;
{ .mib; xor r9=r9,r8
_rotr r8=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
add X[15]=X[15],s0 };; // X[i]+=sigma0(X[i+1])
{ .mmi; xor r9=r9,r8 // Sigma1(e)
add X[15]=X[15],s1 // X[i]+=sigma0(X[i+14])
_rotr r8=$t0,$Sigma0[0] };; // ROTR(a,28)
{ .mmi; add K=K,X[15]
add T1=T1,r9 // T1+=Sigma1(e)
_rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
xor r8=r8,r9
_rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
{ .mib; add D=D,T1
mux2 H=X[15],0x44 } // mov H=X[15] in sha512
{ .mib; xor r8=r8,r9 // Sigma0(a)
add X[15]=T1,T2 // H=T1+Maj(a,b,c)
br.ctop.sptk .L_rest };;
.L_rest_end:
{ .mmi; add A=A,r8 };; // H+=Sigma0(a) from the past
{ .mmi; add A_=A_,A
add B_=B_,B
add C_=C_,C }
{ .mmi; add D_=D_,D
add E_=E_,E
cmp.ltu p16,p0=1,num };;
{ .mmi; add F_=F_,F
add G_=G_,G
add H_=H_,H }
{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
(p16) add num=-1,num
(p16) br.dptk.many .L_outer };;
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx };;
{ .mmi; $STW [r8]=A_,4*$SZ
$STW [r9]=B_,4*$SZ
mov ar.lc=lcsave }
{ .mmi; $STW [r10]=C_,4*$SZ
$STW [r11]=D_,4*$SZ
mov pr=prsave,0x1ffff };;
{ .mmb; $STW [r8]=E_
$STW [r9]=F_ }
{ .mmb; $STW [r10]=G_
$STW [r11]=H_
br.ret.sptk.many b0 };;
.endp $func#
___
foreach(split($/,$code)) {
s/\`([^\`]*)\`/eval $1/gem;
s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
if ($BITS==64) {
s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm;
s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
if (!$big_endian);
s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
}
print $_,"\n";
}
print<<___ if ($BITS==32);
.align 64
.type K256#,\@object
K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256#,$SZ*$rounds
stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
print<<___ if ($BITS==64);
.align 64
.type K512#,\@object
K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
data8 0x3956c25bf348b538,0x59f111f1b605d019
data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
data8 0xd807aa98a3030242,0x12835b0145706fbe
data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
data8 0x9bdc06a725c71235,0xc19bf174cf692694
data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
data8 0x983e5152ee66dfab,0xa831c66d2db43210
data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
data8 0x06ca6351e003826f,0x142929670a0e6e70
data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
data8 0x650a73548baf63de,0x766a0abb3c77b2a8
data8 0x81c2c92e47edaee6,0x92722c851482353b
data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
data8 0xc24b8b70d0f89791,0xc76c51a30654be30
data8 0xd192e819d6ef5218,0xd69906245565a910
data8 0xf40e35855771202a,0x106aa07032bbd1b8
data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
data8 0x748f82ee5defb2fc,0x78a5636f43172f60
data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
data8 0x90befffa23631e28,0xa4506cebde82bde9
data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
data8 0xca273eceea26619c,0xd186b8c721c0c207
data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
data8 0x113f9804bef90dae,0x1b710b35131c471b
data8 0x28db77f523047d84,0x32caab7b40c72493
data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.size K512#,$SZ*$rounds
stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

View File

@@ -0,0 +1,510 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA2 block procedures for MIPS.
# October 2010.
#
# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
# for now can only be compiled for MIPS64 ISA] improvement is modest
# ~17%, but it comes for free, because it's same instruction sequence.
# Improvement coefficients are for aligned input.
# September 2012.
#
# Add MIPS[32|64]R2 code (>25% less instructions).
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
# excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
# <appro@openssl.org>
#
######################################################################
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ld"; # load from memory
$ST="sd"; # store to memory
$SLL="dsll"; # shift left logical
$SRL="dsrl"; # shift right logical
$ADDU="daddu";
$ROTR="drotr";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
} else {
$label="256";
$SZ=4;
$LD="lw"; # load from memory
$ST="sw"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
$ADDU="addu";
$ROTR="rotr";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
}
$MSB = $big_endian ? 0 : ($SZ-1);
$LSB = ($SZ-1)&~$MSB;
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
@X=map("\$$_",(8..23));
$ctx=$a0;
$inp=$a1;
$len=$a2; $Ktbl=$len;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___ if ($i<15);
${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[0],@X[0] # byte swap($i)
rotr @X[0],@X[0],16
#else
srl $tmp0,@X[0],24 # byte swap($i)
srl $tmp1,@X[0],8
andi $tmp2,@X[0],0xFF00
sll @X[0],@X[0],24
andi $tmp1,0xFF00
sll $tmp2,$tmp2,8
or @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
#endif
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
#if defined(_MIPS_ARCH_MIPS64R2)
dsbh @X[0],@X[0] # byte swap($i)
dshd @X[0],@X[0]
#else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,@X[0],$tmp0 # byte swap($i)
dsrl $tmp2,@X[0],24
dsll $tmp1,24
and $tmp2,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
and $tmp2,@X[0],$tmp0
dsrl @X[0],8
dsll $tmp2,8
and @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
dsrl $tmp1,@X[0],32
dsll @X[0],32
or @X[0],$tmp1
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
xor $tmp2,$f,$g # $i
$ROTR $tmp0,$e,@Sigma1[0]
$ADDU $T1,$X[0],$h
$ROTR $tmp1,$e,@Sigma1[1]
and $tmp2,$e
$ROTR $h,$e,@Sigma1[2]
xor $tmp0,$tmp1
$ROTR $tmp1,$a,@Sigma0[0]
xor $tmp2,$g # Ch(e,f,g)
xor $tmp0,$h # Sigma1(e)
$ROTR $h,$a,@Sigma0[1]
$ADDU $T1,$tmp2
$LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
xor $h,$tmp1
$ROTR $tmp1,$a,@Sigma0[2]
$ADDU $T1,$tmp0
and $tmp0,$b,$c
xor $h,$tmp1 # Sigma0(a)
xor $tmp1,$b,$c
#else
$ADDU $T1,$X[0],$h # $i
$SRL $h,$e,@Sigma1[0]
xor $tmp2,$f,$g
$SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
and $tmp2,$e
$SRL $tmp0,$e,@Sigma1[1]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
xor $h,$tmp0
$SRL $tmp0,$e,@Sigma1[2]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
xor $h,$tmp0
xor $tmp2,$g # Ch(e,f,g)
xor $tmp0,$tmp1,$h # Sigma1(e)
$SRL $h,$a,@Sigma0[0]
$ADDU $T1,$tmp2
$LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
$SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
$ADDU $T1,$tmp0
$SRL $tmp0,$a,@Sigma0[1]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
xor $h,$tmp0
$SRL $tmp0,$a,@Sigma0[2]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
xor $h,$tmp0
and $tmp0,$b,$c
xor $h,$tmp1 # Sigma0(a)
xor $tmp1,$b,$c
#endif
$ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
$ADDU $h,$tmp0
and $tmp1,$a
$ADDU $T1,$tmp2 # +=K[$i]
$ADDU $h,$tmp1 # +=Maj(a,b,c)
$ADDU $d,$T1
$ADDU $h,$T1
___
$code.=<<___ if ($i>=13);
$LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
___
}
sub BODY_16_XX {
my $i=@_[0];
my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ROTR $tmp0,@X[1],@sigma0[1]
$ADDU @X[0],@X[9] # +=X[i+9]
xor $tmp2,$tmp0
$ROTR $tmp0,@X[1],@sigma0[2]
$SRL $tmp3,@X[14],@sigma1[0]
$ROTR $tmp1,@X[14],@sigma1[1]
xor $tmp2,$tmp0 # sigma0(X[i+1])
$ROTR $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
$ADDU @X[0],$tmp2
#else
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ADDU @X[0],@X[9] # +=X[i+9]
$SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
$SRL $tmp0,@X[1],@sigma0[1]
xor $tmp2,$tmp1
$SLL $tmp1,`@sigma0[2]-@sigma0[1]`
xor $tmp2,$tmp0
$SRL $tmp0,@X[1],@sigma0[2]
xor $tmp2,$tmp1
$SRL $tmp3,@X[14],@sigma1[0]
xor $tmp2,$tmp0 # sigma0(X[i+1])
$SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
$ADDU @X[0],$tmp2
$SRL $tmp0,@X[14],@sigma1[1]
xor $tmp3,$tmp1
$SLL $tmp1,`@sigma1[2]-@sigma1[1]`
xor $tmp3,$tmp0
$SRL $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
#endif
xor $tmp3,$tmp0 # sigma1(X[i+14])
$ADDU @X[0],$tmp3
___
&BODY_00_15(@_);
}
$FRAMESIZE=16*$SZ+16*$SZREG;
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
$code.=<<___;
#ifdef OPENSSL_FIPSCANISTER
# include <openssl/fipssyms.h>
#endif
#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
#define _MIPS_ARCH_MIPS32R2
#endif
.text
.set noat
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
.align 5
.globl sha${label}_block_data_order
.ent sha${label}_block_data_order
sha${label}_block_data_order:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
___
$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
.cpload $pf
___
$code.=<<___;
$PTR_SUB $sp,$FRAMESIZE
$REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Ktbl
.cpsetup $pf,$zero,sha${label}_block_data_order
___
$code.=<<___;
.set reorder
la $Ktbl,K${label} # PIC-ified 'load address'
$LD $A,0*$SZ($ctx) # load context
$LD $B,1*$SZ($ctx)
$LD $C,2*$SZ($ctx)
$LD $D,3*$SZ($ctx)
$LD $E,4*$SZ($ctx)
$LD $F,5*$SZ($ctx)
$LD $G,6*$SZ($ctx)
$LD $H,7*$SZ($ctx)
$PTR_ADD @X[15],$inp # pointer to the end of input
$REG_S @X[15],16*$SZ($sp)
b .Loop
.align 5
.Loop:
${LD}l @X[0],$MSB($inp)
${LD}r @X[0],$LSB($inp)
___
for ($i=0;$i<16;$i++)
{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
b .L16_xx
.align 4
.L16_xx:
___
for (;$i<32;$i++)
{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
and @X[6],0xfff
li @X[7],$lastK
.set noreorder
bne @X[6],@X[7],.L16_xx
$PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
$REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
$LD @X[0],0*$SZ($ctx)
$LD @X[1],1*$SZ($ctx)
$LD @X[2],2*$SZ($ctx)
$PTR_ADD $inp,16*$SZ
$LD @X[3],3*$SZ($ctx)
$ADDU $A,@X[0]
$LD @X[4],4*$SZ($ctx)
$ADDU $B,@X[1]
$LD @X[5],5*$SZ($ctx)
$ADDU $C,@X[2]
$LD @X[6],6*$SZ($ctx)
$ADDU $D,@X[3]
$LD @X[7],7*$SZ($ctx)
$ADDU $E,@X[4]
$ST $A,0*$SZ($ctx)
$ADDU $F,@X[5]
$ST $B,1*$SZ($ctx)
$ADDU $G,@X[6]
$ST $C,2*$SZ($ctx)
$ADDU $H,@X[7]
$ST $D,3*$SZ($ctx)
$ST $E,4*$SZ($ctx)
$ST $F,5*$SZ($ctx)
$ST $G,6*$SZ($ctx)
$ST $H,7*$SZ($ctx)
bne $inp,@X[15],.Loop
$PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
$REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
.end sha${label}_block_data_order
.rdata
.align 5
K${label}:
___
if ($SZ==4) {
$code.=<<___;
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.dword 0x428a2f98d728ae22, 0x7137449123ef65cd
.dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.dword 0x3956c25bf348b538, 0x59f111f1b605d019
.dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.dword 0xd807aa98a3030242, 0x12835b0145706fbe
.dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.dword 0x9bdc06a725c71235, 0xc19bf174cf692694
.dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.dword 0x983e5152ee66dfab, 0xa831c66d2db43210
.dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.dword 0x06ca6351e003826f, 0x142929670a0e6e70
.dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
.dword 0x81c2c92e47edaee6, 0x92722c851482353b
.dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
.dword 0xd192e819d6ef5218, 0xd69906245565a910
.dword 0xf40e35855771202a, 0x106aa07032bbd1b8
.dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
.dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.dword 0x90befffa23631e28, 0xa4506cebde82bde9
.dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.dword 0xca273eceea26619c, 0xd186b8c721c0c207
.dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.dword 0x113f9804bef90dae, 0x1b710b35131c471b
.dword 0x28db77f523047d84, 0x32caab7b40c72493
.dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
___
}
$code.=<<___;
.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,793 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedure for PA-RISC.
# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by vendor compiler this
# implementation is almost 70% faster in 64-bit build, but delivers
# virtually same performance in 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
# code is executed on PA-RISC 2.0 processor and switches to 64-bit
# code path delivering adequate performance even in "blended" 32-bit
# build. Though 64-bit code is not any faster than code generated by
# vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
if ($output =~ /512/) {
$func="sha512_block_data_order";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LAST10BITS=0x017;
$LD="ldd";
$LDM="ldd,ma";
$ST="std";
} else {
$func="sha256_block_data_order";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LAST10BITS=0x0f2;
$LD="ldw";
$LDM="ldwm";
$ST="stw";
}
$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
# [+ argument transfer]
$XOFF=16*$SZ+32; # local variables
$FRAME+=$XOFF;
$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
$ctx="%r26"; # zapped by $a0
$inp="%r25"; # zapped by $a1
$num="%r24"; # zapped by $t0
$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";
@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
_ror $e,$Sigma1[0],$a0
and $f,$e,$t0
_ror $e,$Sigma1[1],$a1
addl $t1,$h,$h
andcm $g,$e,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
or $t0,$t1,$t1 ; Ch(e,f,g)
addl @X[$i%16],$h,$h
xor $a0,$a1,$a1 ; Sigma1(e)
addl $t1,$h,$h
_ror $a,$Sigma0[0],$a0
addl $a1,$h,$h
_ror $a,$Sigma0[1],$a1
and $a,$b,$t0
and $a,$c,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
xor $t1,$t0,$t0
and $b,$c,$t1
xor $a0,$a1,$a1 ; Sigma0(a)
addl $h,$d,$d
xor $t1,$t0,$t0 ; Maj(a,b,c)
`"$LDM $SZ($Tbl),$t1" if ($i<15)`
addl $a1,$h,$h
addl $t0,$h,$h
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
_ror @X[($i+1)%16],$sigma0[0],$a0
_ror @X[($i+1)%16],$sigma0[1],$a1
addl @X[($i+9)%16],@X[$i],@X[$i]
_ror @X[($i+14)%16],$sigma1[0],$t0
_ror @X[($i+14)%16],$sigma1[1],$t1
xor $a1,$a0,$a0
_shr @X[($i+1)%16],$sigma0[2],$a1
xor $t1,$t0,$t0
_shr @X[($i+14)%16],$sigma1[2],$t1
xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
$LDM $SZ($Tbl),$t1
addl $a0,@X[$i],@X[$i]
addl $t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
extru $t1,31,10,$a1
comiclr,<> $LAST10BITS,$a1,%r0
ldo 1($Tbl),$Tbl ; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.ALIGN 64
L\$table
___
$code.=<<___ if ($SZ==8);
.WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
.WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
.WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
.WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
.WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
.WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
.WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
.WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
.WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
.WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
.WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
.WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
.WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
.WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
.WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
.WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
.WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
.WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
.WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
.WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
.WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
.WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
.WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
.WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
.WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
.WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
.WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
.WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
.WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
.WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
.WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
.WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
.WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
.WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
.WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
.WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
.WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
.WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
.WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
.WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
.WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___;
.EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 64
$func
.PROC
.CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
$PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
$PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
_shl $num,`log(16*$SZ)/log(2)`,$num
addl $inp,$num,$num ; $num to point at the end of $inp
$PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
$PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
blr %r0,$Tbl
ldi 3,$t1
L\$pic
andcm $Tbl,$t1,$Tbl ; wipe privilege level
ldo L\$table-L\$pic($Tbl),$Tbl
___
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
ldi 31,$t1
mtctl $t1,%cr11
extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
b L\$parisc1
nop
___
$code.=<<___;
$LD `0*$SZ`($ctx),$A ; load context
$LD `1*$SZ`($ctx),$B
$LD `2*$SZ`($ctx),$C
$LD `3*$SZ`($ctx),$D
$LD `4*$SZ`($ctx),$E
$LD `5*$SZ`($ctx),$F
$LD `6*$SZ`($ctx),$G
$LD `7*$SZ`($ctx),$H
extru $inp,31,`log($SZ)/log(2)`,$t0
sh3addl $t0,%r0,$t0
subi `8*$SZ`,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop
ldi `$SZ-1`,$t0
$LDM $SZ($Tbl),$t1
andcm $inp,$t0,$t0 ; align $inp
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
$LD `$SZ*15`($t0),@X[15]
$LD `$SZ*16`($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align data
$code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
nop
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
$LD `0*$SZ`($ctx),@X[0] ; load context
$LD `1*$SZ`($ctx),@X[1]
$LD `2*$SZ`($ctx),@X[2]
$LD `3*$SZ`($ctx),@X[3]
$LD `4*$SZ`($ctx),@X[4]
$LD `5*$SZ`($ctx),@X[5]
addl @X[0],$A,$A
$LD `6*$SZ`($ctx),@X[6]
addl @X[1],$B,$B
$LD `7*$SZ`($ctx),@X[7]
ldo `16*$SZ`($inp),$inp ; advance $inp
$ST $A,`0*$SZ`($ctx) ; save context
addl @X[2],$C,$C
$ST $B,`1*$SZ`($ctx)
addl @X[3],$D,$D
$ST $C,`2*$SZ`($ctx)
addl @X[4],$E,$E
$ST $D,`3*$SZ`($ctx)
addl @X[5],$F,$F
$ST $E,`4*$SZ`($ctx)
addl @X[6],$G,$G
$ST $F,`5*$SZ`($ctx)
addl @X[7],$H,$H
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cmpb,*<>,n $inp,$num,L\$oop
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
___
if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
b L\$done
nop
.ALIGN 64
L\$parisc1
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";
@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
$code.=<<___ if (!$flag);
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
___
$code.=<<___;
shd $ehi,$elo,$Sigma1[0],$t0
add $Xlo,$hlo,$hlo
shd $elo,$ehi,$Sigma1[0],$t1
addc $Xhi,$hhi,$hhi ; h += X[i]
shd $ehi,$elo,$Sigma1[1],$t2
ldwm 8($Tbl),$Xhi
shd $elo,$ehi,$Sigma1[1],$t3
ldw -4($Tbl),$Xlo ; load K[i]
xor $t2,$t0,$t0
xor $t3,$t1,$t1
and $flo,$elo,$a0
and $fhi,$ehi,$a1
shd $ehi,$elo,$Sigma1[2],$t2
andcm $glo,$elo,$a2
shd $elo,$ehi,$Sigma1[2],$t3
andcm $ghi,$ehi,$a3
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma1(e)
add $Xlo,$hlo,$hlo
xor $a2,$a0,$a0
addc $Xhi,$hhi,$hhi ; h += K[i]
xor $a3,$a1,$a1 ; Ch(e,f,g)
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
shd $alo,$ahi,$Sigma0[1],$t3
xor $t2,$t0,$t0
xor $t3,$t1,$t1
shd $ahi,$alo,$Sigma0[2],$t2
and $alo,$blo,$a0
shd $alo,$ahi,$Sigma0[2],$t3
and $ahi,$bhi,$a1
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma0(a)
and $alo,$clo,$a2
and $ahi,$chi,$a3
xor $a2,$a0,$a0
add $hlo,$dlo,$dlo
xor $a3,$a1,$a1
addc $hhi,$dhi,$dhi ; d += h
and $blo,$clo,$a2
add $t0,$hlo,$hlo
and $bhi,$chi,$a3
addc $t1,$hhi,$hhi ; h += Sigma0(a)
xor $a2,$a0,$a0
add $a0,$hlo,$hlo
xor $a3,$a1,$a1 ; Maj(a,b,c)
addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
___
$code.=<<___ if ($i==15 && $flag);
extru $Xlo,31,10,$Xlo
comiclr,= $LAST10BITS,$Xlo,%r0
b L\$rounds_pa1
nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
shd $Xnhi,$Xnlo,$sigma0[0],$t0
shd $Xnlo,$Xnhi,$sigma0[0],$t1
add $a0,$Xlo,$Xlo
shd $Xnhi,$Xnlo,$sigma0[1],$t2
addc $a1,$Xhi,$Xhi
shd $Xnlo,$Xnhi,$sigma0[1],$t3
xor $t2,$t0,$t0
shd $Xnhi,$Xnlo,$sigma0[2],$t2
xor $t3,$t1,$t1
extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
xor $t2,$t0,$t0
shd $a3,$a2,$sigma1[0],$a0
xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f])
shd $a2,$a3,$sigma1[0],$a1
add $t0,$Xlo,$Xlo
shd $a3,$a2,$sigma1[1],$t2
addc $t1,$Xhi,$Xhi
shd $a2,$a3,$sigma1[1],$t3
xor $t2,$a0,$a0
shd $a3,$a2,$sigma1[2],$t2
xor $t3,$a1,$a1
extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
xor $t2,$a0,$a0
xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f])
add $a0,$Xlo,$Xlo
addc $a1,$Xhi,$Xhi
stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
$code.=<<___;
ldw `0*4`($ctx),$Ahi ; load context
ldw `1*4`($ctx),$Alo
ldw `2*4`($ctx),$Bhi
ldw `3*4`($ctx),$Blo
ldw `4*4`($ctx),$Chi
ldw `5*4`($ctx),$Clo
ldw `6*4`($ctx),$Dhi
ldw `7*4`($ctx),$Dlo
ldw `8*4`($ctx),$Ehi
ldw `9*4`($ctx),$Elo
ldw `10*4`($ctx),$Fhi
ldw `11*4`($ctx),$Flo
ldw `12*4`($ctx),$Ghi
ldw `13*4`($ctx),$Glo
ldw `14*4`($ctx),$Hhi
ldw `15*4`($ctx),$Hlo
extru $inp,31,2,$t0
sh3addl $t0,%r0,$t0
subi 32,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop_pa1
extru $inp,31,2,$a3
comib,= 0,$a3,L\$aligned_pa1
sub $inp,$a3,$inp
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
vshd $X[0],$X[1],$X[0]
vshd $X[1],$t2,$X[1]
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
vshd $t2,$t3,$t2
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
vshd $t3,$a0,$t3
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
b L\$collected_pa1
stw $t[0],`-$XOFF+$i*4`(%sp)
___
}
$code.=<<___;
L\$aligned_pa1
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}
for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.=<<___;
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
ldw `0*4`($ctx),$t1 ; update context
ldw `1*4`($ctx),$t0
ldw `2*4`($ctx),$t3
ldw `3*4`($ctx),$t2
ldw `4*4`($ctx),$a1
ldw `5*4`($ctx),$a0
ldw `6*4`($ctx),$a3
add $t0,$Alo,$Alo
ldw `7*4`($ctx),$a2
addc $t1,$Ahi,$Ahi
ldw `8*4`($ctx),$t1
add $t2,$Blo,$Blo
ldw `9*4`($ctx),$t0
addc $t3,$Bhi,$Bhi
ldw `10*4`($ctx),$t3
add $a0,$Clo,$Clo
ldw `11*4`($ctx),$t2
addc $a1,$Chi,$Chi
ldw `12*4`($ctx),$a1
add $a2,$Dlo,$Dlo
ldw `13*4`($ctx),$a0
addc $a3,$Dhi,$Dhi
ldw `14*4`($ctx),$a3
add $t0,$Elo,$Elo
ldw `15*4`($ctx),$a2
addc $t1,$Ehi,$Ehi
stw $Ahi,`0*4`($ctx)
add $t2,$Flo,$Flo
stw $Alo,`1*4`($ctx)
addc $t3,$Fhi,$Fhi
stw $Bhi,`2*4`($ctx)
add $a0,$Glo,$Glo
stw $Blo,`3*4`($ctx)
addc $a1,$Ghi,$Ghi
stw $Chi,`4*4`($ctx)
add $a2,$Hlo,$Hlo
stw $Clo,`5*4`($ctx)
addc $a3,$Hhi,$Hhi
stw $Dhi,`6*4`($ctx)
ldo `16*$SZ`($inp),$inp ; advance $inp
stw $Dlo,`7*4`($ctx)
stw $Ehi,`8*4`($ctx)
stw $Elo,`9*4`($ctx)
stw $Fhi,`10*4`($ctx)
stw $Flo,`11*4`($ctx)
stw $Ghi,`12*4`($ctx)
stw $Glo,`13*4`($ctx)
stw $Hhi,`14*4`($ctx)
comb,= $inp,$num,L\$done
stw $Hlo,`15*4`($ctx)
b L\$oop_pa1
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
L\$done
___
}}
$code.=<<___;
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
$POP `-$FRAME+14*$SIZE_T`(%sp),%r17
$POP `-$FRAME+15*$SIZE_T`(%sp),%r18
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
{ my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
$opcode|=(1<<3) if ($mod =~ /^,m/);
$opcode|=(1<<2) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
: sprintf("shd\t%$1,%$2,%d",$3)/e or
# translate made up instructons: _ror, _shr, _align, _shl
s/_ror(\s+)(%r[0-9]+),/
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
s/_shr(\s+%r[0-9]+),([0-9]+),/
$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
s/_align(\s+%r[0-9]+,%r[0-9]+),/
($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
s/_shl(\s+%r[0-9]+),([0-9]+),/
$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
s/cmpb,\*/comb,/ if ($SIZE_T==4);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,792 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank.
# sha256 | sha512
# -m64 -m32 | -m64 -m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
# Power6,xlc-7 +150% +90% | +100% +430%(*)
#
# (*) 64-bit code in 32-bit application context, which actually is
# on TODO list. It should be noted that for safe deployment in
# 32-bit *mutli-threaded* context asyncronous signals should be
# blocked upon entry to SHA512 block routine. This is because
# 32-bit signaling procedure invalidates upper halves of GPRs.
# Context switch procedure preserves them, but not signaling:-(
# Second version is true multi-thread safe. Trouble with the original
# version was that it was using thread local storage pointer register.
# Well, it scrupulously preserved it, but the problem would arise the
# moment asynchronous signal was delivered and signal handler would
# dereference the TLS pointer. While it's never the case in openssl
# application or test suite, we have to respect this scenario and not
# use TLS pointer register. Alternative would be to require caller to
# block signals prior calling this routine. For the record, in 32-bit
# context R2 serves as TLS pointer, while in 64-bit context - R13.
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
if ($output =~ /512/) {
$func="sha512_block_ppc";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LD="ld";
$ST="std";
$ROR="rotrdi";
$SHR="srdi";
} else {
$func="sha256_block_ppc";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LD="lwz";
$ST="stw";
$ROR="rotrwi";
$SHR="srwi";
}
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
$ctx="r3"; # zapped by $a0
$inp="r4"; # zapped by $a1
$num="r5"; # zapped by $t0
$T ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";
$A ="r8";
$B ="r9";
$C ="r10";
$D ="r11";
$E ="r12";
$F =$t1; $t1 = "r0"; # stay away from "r13";
$G ="r14";
$H ="r15";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
$inp="r31" if($SZ==4 || $SIZE_T==8); # reassigned $inp! aliases with @X[15]
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
$ROR $a0,$e,$Sigma1[0]
$ROR $a1,$e,$Sigma1[1]
and $t0,$f,$e
xor $a0,$a0,$a1
add $h,$h,$t1
andc $t1,$g,$e
$ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
or $t0,$t0,$t1 ; Ch(e,f,g)
add $h,$h,@X[$i%16]
xor $a0,$a0,$a1 ; Sigma1(e)
add $h,$h,$t0
add $h,$h,$a0
$ROR $a0,$a,$Sigma0[0]
$ROR $a1,$a,$Sigma0[1]
and $t0,$a,$b
and $t1,$a,$c
xor $a0,$a0,$a1
$ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
xor $t0,$t0,$t1
and $t1,$b,$c
xor $a0,$a0,$a1 ; Sigma0(a)
add $d,$d,$h
xor $t0,$t0,$t1 ; Maj(a,b,c)
___
$code.=<<___ if ($i<15);
$LD $t1,`($i+1)*$SZ`($Tbl)
___
$code.=<<___;
add $h,$h,$a0
add $h,$h,$t0
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
$ROR $a0,@X[($i+1)%16],$sigma0[0]
$ROR $a1,@X[($i+1)%16],$sigma0[1]
$ROR $t0,@X[($i+14)%16],$sigma1[0]
$ROR $t1,@X[($i+14)%16],$sigma1[1]
xor $a0,$a0,$a1
$SHR $a1,@X[($i+1)%16],$sigma0[2]
xor $t0,$t0,$t1
$SHR $t1,@X[($i+14)%16],$sigma1[2]
add @X[$i],@X[$i],@X[($i+9)%16]
xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
$LD $t1,`$i*$SZ`($Tbl)
add @X[$i],@X[$i],$a0
add @X[$i],@X[$i],$t0
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.machine "any"
.text
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr r0
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
___
} else {
for ($i=16;$i<32;$i++) {
$code.=<<___;
lwz r$i,`$LITTLE_ENDIAN^(4*($i-16))`($ctx)
___
}
}
$code.=<<___;
bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
Laligned:
add $num,$inp,$num
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for the input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
subfc $num,$t1,$num
add $t1,$inp,$t1
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
; $inp equals to the intermediate end pointer here
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
___
} else {
$code.=<<___;
addi r12,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r8,0($inp)
lbz r9,1($inp)
lbz r10,2($inp)
lbz r11,3($inp)
addi $inp,$inp,4
stb r8,0(r12)
stb r9,1(r12)
stb r10,2(r12)
stb r11,3(r12)
addi r12,r12,4
bdnz Lmemcpy
___
}
$code.=<<___;
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
$POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
.align 4
Lsha2_block_private:
$LD $t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN);
lwz @X[$i],`$i*$SZ`($inp)
___
$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN);
lwz $a0,`$i*$SZ`($inp)
rotlwi @X[$i],$a0,8
rlwimi @X[$i],$a0,24,0,7
rlwimi @X[$i],$a0,24,16,23
___
# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN);
lwz $t0,`$i*$SZ`($inp)
lwz @X[$i],`$i*$SZ+4`($inp)
insrdi @X[$i],$t0,32,0
___
$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN);
lwz $a0,`$i*$SZ`($inp)
lwz $a1,`$i*$SZ+4`($inp)
rotlwi $t0,$a0,8
rotlwi @X[$i],$a1,8
rlwimi $t0,$a0,24,0,7
rlwimi @X[$i],$a1,24,0,7
rlwimi $t0,$a0,24,16,23
rlwimi @X[$i],$a1,24,16,23
insrdi @X[$i],$t0,32,0
___
&ROUND_00_15($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
li $t0,`$rounds/16-1`
mtctr $t0
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
&ROUND_16_xx($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
bdnz- Lrounds
$POP $ctx,`$FRAME-$SIZE_T*22`($sp)
$POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
$POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
$LD r16,`0*$SZ`($ctx)
$LD r17,`1*$SZ`($ctx)
$LD r18,`2*$SZ`($ctx)
$LD r19,`3*$SZ`($ctx)
$LD r20,`4*$SZ`($ctx)
$LD r21,`5*$SZ`($ctx)
$LD r22,`6*$SZ`($ctx)
addi $inp,$inp,`16*$SZ` ; advance inp
$LD r23,`7*$SZ`($ctx)
add $A,$A,r16
add $B,$B,r17
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
add $C,$C,r18
$ST $A,`0*$SZ`($ctx)
add $D,$D,r19
$ST $B,`1*$SZ`($ctx)
add $E,$E,r20
$ST $C,`2*$SZ`($ctx)
add $F,$F,r21
$ST $D,`3*$SZ`($ctx)
add $G,$G,r22
$ST $E,`4*$SZ`($ctx)
add $H,$H,r23
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$UCMP $inp,$num
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size $func,.-$func
___
} else {
########################################################################
# SHA512 for PPC32, X vector is off-loaded to stack...
#
# | sha512
# | -m32
# ----------------------+-----------------------
# PPC74x0,gcc-4.0.1 | +48%
# POWER6,gcc-4.4.6 | +124%(*)
# POWER7,gcc-4.4.6 | +79%(*)
# e300,gcc-4.1.0 | +167%
#
# (*) ~1/3 of -m64 result [and ~20% better than -m32 code generated
# by xlc-12.1]
my $XOFF=$LOCALS;
my @V=map("r$_",(16..31)); # A..H
my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
my ($x0,$x1)=("r3","r4"); # zaps $ctx and $inp
sub ROUND_00_15_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
$code.=<<___;
lwz $t2,`$SZ*($i%16)+($LITTLE_ENDIAN^4)`($Tbl)
xor $a0,$flo,$glo
lwz $t3,`$SZ*($i%16)+($LITTLE_ENDIAN^0)`($Tbl)
xor $a1,$fhi,$ghi
addc $hlo,$hlo,$t0 ; h+=x[i]
stw $t0,`$XOFF+0+$SZ*($i%16)`($sp) ; save x[i]
srwi $s0,$elo,$Sigma1[0]
srwi $s1,$ehi,$Sigma1[0]
and $a0,$a0,$elo
adde $hhi,$hhi,$t1
and $a1,$a1,$ehi
stw $t1,`$XOFF+4+$SZ*($i%16)`($sp)
srwi $t0,$elo,$Sigma1[1]
srwi $t1,$ehi,$Sigma1[1]
addc $hlo,$hlo,$t2 ; h+=K512[i]
insrwi $s0,$ehi,$Sigma1[0],0
insrwi $s1,$elo,$Sigma1[0],0
xor $a0,$a0,$glo ; Ch(e,f,g)
adde $hhi,$hhi,$t3
xor $a1,$a1,$ghi
insrwi $t0,$ehi,$Sigma1[1],0
insrwi $t1,$elo,$Sigma1[1],0
addc $hlo,$hlo,$a0 ; h+=Ch(e,f,g)
srwi $t2,$ehi,$Sigma1[2]-32
srwi $t3,$elo,$Sigma1[2]-32
xor $s0,$s0,$t0
xor $s1,$s1,$t1
insrwi $t2,$elo,$Sigma1[2]-32,0
insrwi $t3,$ehi,$Sigma1[2]-32,0
xor $a0,$alo,$blo ; a^b, b^c in next round
adde $hhi,$hhi,$a1
xor $a1,$ahi,$bhi
xor $s0,$s0,$t2 ; Sigma1(e)
xor $s1,$s1,$t3
srwi $t0,$alo,$Sigma0[0]
and $a2,$a2,$a0
addc $hlo,$hlo,$s0 ; h+=Sigma1(e)
and $a3,$a3,$a1
srwi $t1,$ahi,$Sigma0[0]
srwi $s0,$ahi,$Sigma0[1]-32
adde $hhi,$hhi,$s1
srwi $s1,$alo,$Sigma0[1]-32
insrwi $t0,$ahi,$Sigma0[0],0
insrwi $t1,$alo,$Sigma0[0],0
xor $a2,$a2,$blo ; Maj(a,b,c)
addc $dlo,$dlo,$hlo ; d+=h
xor $a3,$a3,$bhi
insrwi $s0,$alo,$Sigma0[1]-32,0
insrwi $s1,$ahi,$Sigma0[1]-32,0
adde $dhi,$dhi,$hhi
srwi $t2,$ahi,$Sigma0[2]-32
srwi $t3,$alo,$Sigma0[2]-32
xor $s0,$s0,$t0
addc $hlo,$hlo,$a2 ; h+=Maj(a,b,c)
xor $s1,$s1,$t1
insrwi $t2,$alo,$Sigma0[2]-32,0
insrwi $t3,$ahi,$Sigma0[2]-32,0
adde $hhi,$hhi,$a3
___
$code.=<<___ if ($i>=15);
lwz $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
lwz $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
___
$code.=<<___ if ($i<15 && !$LITTLE_ENDIAN);
lwz $t1,`$SZ*($i+1)+0`($inp)
lwz $t0,`$SZ*($i+1)+4`($inp)
___
$code.=<<___ if ($i<15 && $LITTLE_ENDIAN);
lwz $a2,`$SZ*($i+1)+0`($inp)
lwz $a3,`$SZ*($i+1)+4`($inp)
rotlwi $t1,$a2,8
rotlwi $t0,$a3,8
rlwimi $t1,$a2,24,0,7
rlwimi $t0,$a3,24,0,7
rlwimi $t1,$a2,24,16,23
rlwimi $t0,$a3,24,16,23
___
$code.=<<___;
xor $s0,$s0,$t2 ; Sigma0(a)
xor $s1,$s1,$t3
addc $hlo,$hlo,$s0 ; h+=Sigma0(a)
adde $hhi,$hhi,$s1
___
$code.=<<___ if ($i==15);
lwz $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
lwz $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
___
}
sub ROUND_16_xx_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
$code.=<<___;
srwi $s0,$t0,$sigma0[0]
srwi $s1,$t1,$sigma0[0]
srwi $t2,$t0,$sigma0[1]
srwi $t3,$t1,$sigma0[1]
insrwi $s0,$t1,$sigma0[0],0
insrwi $s1,$t0,$sigma0[0],0
srwi $a0,$t0,$sigma0[2]
insrwi $t2,$t1,$sigma0[1],0
insrwi $t3,$t0,$sigma0[1],0
insrwi $a0,$t1,$sigma0[2],0
xor $s0,$s0,$t2
lwz $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
srwi $a1,$t1,$sigma0[2]
xor $s1,$s1,$t3
lwz $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
xor $a0,$a0,$s0
srwi $s0,$t2,$sigma1[0]
xor $a1,$a1,$s1
srwi $s1,$t3,$sigma1[0]
addc $x0,$x0,$a0 ; x[i]+=sigma0(x[i+1])
srwi $a0,$t3,$sigma1[1]-32
insrwi $s0,$t3,$sigma1[0],0
insrwi $s1,$t2,$sigma1[0],0
adde $x1,$x1,$a1
srwi $a1,$t2,$sigma1[1]-32
insrwi $a0,$t2,$sigma1[1]-32,0
srwi $t2,$t2,$sigma1[2]
insrwi $a1,$t3,$sigma1[1]-32,0
insrwi $t2,$t3,$sigma1[2],0
xor $s0,$s0,$a0
lwz $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
srwi $t3,$t3,$sigma1[2]
xor $s1,$s1,$a1
lwz $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
xor $s0,$s0,$t2
addc $x0,$x0,$a0 ; x[i]+=x[i+9]
xor $s1,$s1,$t3
adde $x1,$x1,$a1
addc $x0,$x0,$s0 ; x[i]+=sigma1(x[i+14])
adde $x1,$x1,$s1
___
($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
&ROUND_00_15_ppc32(@_);
}
$code.=<<___;
.align 4
Lsha2_block_private:
___
$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $t1,0($inp)
xor $a2,@V[3],@V[5] ; B^C, magic seed
lwz $t0,4($inp)
xor $a3,@V[2],@V[4]
___
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $a1,0($inp)
xor $a2,@V[3],@V[5] ; B^C, magic seed
lwz $a0,4($inp)
xor $a3,@V[2],@V[4]
rotlwi $t1,$a1,8
rotlwi $t0,$a0,8
rlwimi $t1,$a1,24,0,7
rlwimi $t0,$a0,24,0,7
rlwimi $t1,$a1,24,16,23
rlwimi $t0,$a0,24,16,23
___
for($i=0;$i<16;$i++) {
&ROUND_00_15_ppc32($i,@V);
unshift(@V,pop(@V)); unshift(@V,pop(@V));
($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
li $a0,`$rounds/16-1`
mtctr $a0
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
&ROUND_16_xx_ppc32($i,@V);
unshift(@V,pop(@V)); unshift(@V,pop(@V));
($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
bdnz- Lrounds
$POP $ctx,`$FRAME-$SIZE_T*22`($sp)
$POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
$POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
lwz $t0,`$LITTLE_ENDIAN^0`($ctx)
lwz $t1,`$LITTLE_ENDIAN^4`($ctx)
lwz $t2,`$LITTLE_ENDIAN^8`($ctx)
lwz $t3,`$LITTLE_ENDIAN^12`($ctx)
lwz $a0,`$LITTLE_ENDIAN^16`($ctx)
lwz $a1,`$LITTLE_ENDIAN^20`($ctx)
lwz $a2,`$LITTLE_ENDIAN^24`($ctx)
addc @V[1],@V[1],$t1
lwz $a3,`$LITTLE_ENDIAN^28`($ctx)
adde @V[0],@V[0],$t0
lwz $t0,`$LITTLE_ENDIAN^32`($ctx)
addc @V[3],@V[3],$t3
lwz $t1,`$LITTLE_ENDIAN^36`($ctx)
adde @V[2],@V[2],$t2
lwz $t2,`$LITTLE_ENDIAN^40`($ctx)
addc @V[5],@V[5],$a1
lwz $t3,`$LITTLE_ENDIAN^44`($ctx)
adde @V[4],@V[4],$a0
lwz $a0,`$LITTLE_ENDIAN^48`($ctx)
addc @V[7],@V[7],$a3
lwz $a1,`$LITTLE_ENDIAN^52`($ctx)
adde @V[6],@V[6],$a2
lwz $a2,`$LITTLE_ENDIAN^56`($ctx)
addc @V[9],@V[9],$t1
lwz $a3,`$LITTLE_ENDIAN^60`($ctx)
adde @V[8],@V[8],$t0
stw @V[0],`$LITTLE_ENDIAN^0`($ctx)
stw @V[1],`$LITTLE_ENDIAN^4`($ctx)
addc @V[11],@V[11],$t3
stw @V[2],`$LITTLE_ENDIAN^8`($ctx)
stw @V[3],`$LITTLE_ENDIAN^12`($ctx)
adde @V[10],@V[10],$t2
stw @V[4],`$LITTLE_ENDIAN^16`($ctx)
stw @V[5],`$LITTLE_ENDIAN^20`($ctx)
addc @V[13],@V[13],$a1
stw @V[6],`$LITTLE_ENDIAN^24`($ctx)
stw @V[7],`$LITTLE_ENDIAN^28`($ctx)
adde @V[12],@V[12],$a0
stw @V[8],`$LITTLE_ENDIAN^32`($ctx)
stw @V[9],`$LITTLE_ENDIAN^36`($ctx)
addc @V[15],@V[15],$a3
stw @V[10],`$LITTLE_ENDIAN^40`($ctx)
stw @V[11],`$LITTLE_ENDIAN^44`($ctx)
adde @V[14],@V[14],$a2
stw @V[12],`$LITTLE_ENDIAN^48`($ctx)
stw @V[13],`$LITTLE_ENDIAN^52`($ctx)
stw @V[14],`$LITTLE_ENDIAN^56`($ctx)
stw @V[15],`$LITTLE_ENDIAN^60`($ctx)
addi $inp,$inp,`16*$SZ` ; advance inp
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
$UCMP $inp,$num
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size $func,.-$func
___
}
# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,322 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedures for s390x.
# April 2007.
#
# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
# generated code (must be a bug in compiler, as improvement is
# "pathologically" high, in particular in comparison to other SHA
# modules). But the real twist is that it detects if hardware support
# for SHA256 is available and in such case utilizes it. Then the
# performance can reach >6.5x of assembler one for larger chunks.
#
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z900 SHA256 was measured to
# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
$t0="%r0";
$t1="%r1";
$ctx="%r2"; $t2="%r2";
$inp="%r3";
$len="%r4"; # used as index in inner loop
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";
$F="%r10";
$G="%r11";
$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
$tbl="%r13";
$T1="%r14";
$sp="%r15";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="lg"; # load from memory
$ST="stg"; # store to memory
$ADD="alg"; # add with memory operand
$ROT="rllg"; # rotate left
$SHR="srlg"; # logical right shift [see even at the end]
@Sigma0=(25,30,36);
@Sigma1=(23,46,50);
@sigma0=(56,63, 7);
@sigma1=( 3,45, 6);
$rounds=80;
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
} else {
$label="256";
$SZ=4;
$LD="llgf"; # load from memory
$ST="st"; # store to memory
$ADD="al"; # add with memory operand
$ROT="rll"; # rotate left
$SHR="srl"; # logical right shift
@Sigma0=(10,19,30);
@Sigma1=( 7,21,26);
@sigma0=(14,25, 3);
@sigma1=(13,15,10);
$rounds=64;
$kimdfunc=2; # magic function code for kimd instruction
}
$Func="sha${label}_block_data_order";
$Table="K${label}";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*$SZ;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
$LD $T1,`$i*$SZ`($inp) ### $i
___
$code.=<<___;
$ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t2,$g
$ST $T1,`$stdframe+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
algr $T1,$h # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
$ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a)
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
algr $d,$T1 # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
$LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
$LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t1,$sigma1[0]
xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
$ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
&BODY_00_15(@_);
}
$code.=<<___;
.text
.align 64
.type $Table,\@object
$Table:
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___;
.size $Table,.-$Table
.globl $Func
.type $Func,\@function
$Func:
sllg $len,$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
lgr %r3,$len
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
la $len,0($len,$inp)
stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
.Lloop:
lghi $len,0
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
aghi $len,`16*$SZ`
lghi $t0,`($rounds-16)*$SZ`
clgr $len,$t0
jne .Lrounds_16_xx
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx)
$ADD $D,`3*$SZ`($ctx)
$ADD $E,`4*$SZ`($ctx)
$ADD $F,`5*$SZ`($ctx)
$ADD $G,`6*$SZ`($ctx)
$ADD $H,`7*$SZ`($ctx)
$ST $A,`0*$SZ`($ctx)
$ST $B,`1*$SZ`($ctx)
$ST $C,`2*$SZ`($ctx)
$ST $D,`3*$SZ`($ctx)
$ST $E,`4*$SZ`($ctx)
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# unlike 32-bit shift 64-bit one takes three arguments
$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
print $code;
close STDOUT;

View File

@@ -0,0 +1,850 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================
# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevetheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
# in-order, i.e. load instruction has to complete prior next
# instruction in given thread is executed, even if the latter is
# not dependent on load result! This means that on T1 two 32-bit
# loads are always slower than one 64-bit load. Once again this
# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
# 2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
$output=shift;
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ldx"; # load from memory
$ST="stx"; # store to memory
$SLL="sllx"; # shift left logical
$SRL="srlx"; # shift right logical
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
$align=4;
$locals=16*$SZ; # X[16]
$A="%o0";
$B="%o1";
$C="%o2";
$D="%o3";
$E="%o4";
$F="%o5";
$G="%g1";
$H="%o7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
$label="256";
$SZ=4;
$LD="ld"; # load from memory
$ST="st"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
$align=8;
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
$F="%l5";
$G="%l6";
$H="%l7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i==0) {
$code.=<<___;
ldx [$inp+0],@X[0]
ldx [$inp+16],@X[2]
ldx [$inp+32],@X[4]
ldx [$inp+48],@X[6]
ldx [$inp+8],@X[1]
ldx [$inp+24],@X[3]
subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
ldx [$inp+40],@X[5]
bz,pt %icc,.Laligned
ldx [$inp+56],@X[7]
sllx @X[0],$tmp31,@X[0]
ldx [$inp+64],$T1
___
for($j=0;$j<7;$j++)
{ $code.=<<___;
srlx @X[$j+1],$tmp32,$tmp1
sllx @X[$j+1],$tmp31,@X[$j+1]
or $tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
srlx $T1,$tmp32,$T1
or $T1,@X[7],@X[7]
.Laligned:
___
}
if ($i&1) {
$code.="\tadd @X[$i/2],$h,$T1\n";
} else {
$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
}
} if ($SZ==4);
########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
$code.=<<___ if ($i==0);
ld [$inp+0],%l0
ld [$inp+4],%l1
ld [$inp+8],%l2
ld [$inp+12],%l3
ld [$inp+16],%l4
ld [$inp+20],%l5
ld [$inp+24],%l6
cmp $tmp31,0
ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
sllx @pair[0],$tmp0,$tmp1
`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
bnz,a,pn %icc,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i<16) {
&$Xload(@_);
} else {
$code.="\tadd $h,$T1,$T1\n";
}
$code.=<<___;
$SRL $e,@Sigma1[0],$h !! $i
xor $f,$g,$tmp2
$SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
and $e,$tmp2,$tmp2
$SRL $e,@Sigma1[1],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $e,@Sigma1[2],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
xor $tmp0,$h,$h
xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
xor $tmp1,$h,$tmp0 ! Sigma1(e)
$SRL $a,@Sigma0[0],$h
add $tmp2,$T1,$T1
$LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
$SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
add $tmp0,$T1,$T1
$SRL $a,@Sigma0[1],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
or $a,$b,$tmp0
and $a,$b,$tmp1
and $c,$tmp0,$tmp0
or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
add $tmp2,$T1,$T1 ! +=K[$i]
add $tmp1,$h,$h
add $T1,$d,$d
add $T1,$h,$h
___
}
########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;
if ($i&1) {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
} else {
$xi=@X[(($i+1)/2)%8];
}
$code.=<<___;
srl $xi,@sigma0[0],$T1 !! Xupdate($i)
sll $xi,`32-@sigma0[2]`,$tmp1
srl $xi,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srl $xi,@sigma0[2],$tmp0
xor $tmp1,$T1,$T1
___
if ($i&1) {
$xi=@X[(($i+14)/2)%8];
} else {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
}
$code.=<<___;
srl $xi,@sigma1[0],$tmp2
xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
sll $xi,`32-@sigma1[2]`,$tmp1
srl $xi,@sigma1[1],$tmp0
xor $tmp1,$tmp2,$tmp2
sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
xor $tmp0,$tmp2,$tmp2
srl $xi,@sigma1[2],$tmp0
xor $tmp1,$tmp2,$tmp2
___
if ($i&1) {
$xi=@X[($i/2)%8];
$code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
add $tmp1,$T1,$T1
srl $T1,0,$T1
or $T1,@X[($i/2)%8],@X[($i/2)%8]
___
} else {
$xi=@X[(($i+9)/2)%8];
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
add $xi,$T1,$T1 ! +=X[i+9]
add $tmp2,$tmp1,$tmp1
srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
}
&BODY_00_15(@_);
} if ($SZ==4);
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
$code.=<<___;
sllx %l2,32,$tmp0 !! Xupdate($i)
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
xor $tmp1,$T1,$T1
sllx %l6,32,$tmp2
xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
xor $tmp2,$tmp1,$tmp1
srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
K${label}:
.type K${label},#object
___
if ($SZ==4) {
$code.=<<___;
.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size K${label},.-K${label}
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl sha${label}_block_data_order
.align 32
sha${label}_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_SHA${label}, %g0
be .Lsoftware
nop
___
$code.=<<___ if ($SZ==8); # SHA512
ldd [%o0 + 0x00], %f0 ! load context
ldd [%o0 + 0x08], %f2
ldd [%o0 + 0x10], %f4
ldd [%o0 + 0x18], %f6
ldd [%o0 + 0x20], %f8
ldd [%o0 + 0x28], %f10
andcc %o1, 0x7, %g0
ldd [%o0 + 0x30], %f12
bne,pn %icc, .Lhwunaligned
ldd [%o0 + 0x38], %f14
.Lhwaligned_loop:
ldd [%o1 + 0x00], %f16
ldd [%o1 + 0x08], %f18
ldd [%o1 + 0x10], %f20
ldd [%o1 + 0x18], %f22
ldd [%o1 + 0x20], %f24
ldd [%o1 + 0x28], %f26
ldd [%o1 + 0x30], %f28
ldd [%o1 + 0x38], %f30
ldd [%o1 + 0x40], %f32
ldd [%o1 + 0x48], %f34
ldd [%o1 + 0x50], %f36
ldd [%o1 + 0x58], %f38
ldd [%o1 + 0x60], %f40
ldd [%o1 + 0x68], %f42
ldd [%o1 + 0x70], %f44
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x78], %f46
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20
.word 0x81b02860 ! SHA512
bne,pt SIZE_T_CC, .Lhwaligned_loop
nop
.Lhwfinish:
std %f0, [%o0 + 0x00] ! store context
std %f2, [%o0 + 0x08]
std %f4, [%o0 + 0x10]
std %f6, [%o0 + 0x18]
std %f8, [%o0 + 0x20]
std %f10, [%o0 + 0x28]
std %f12, [%o0 + 0x30]
retl
std %f14, [%o0 + 0x38]
.align 16
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f18
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f20
ldd [%o1 + 0x10], %f22
ldd [%o1 + 0x18], %f24
ldd [%o1 + 0x20], %f26
ldd [%o1 + 0x28], %f28
ldd [%o1 + 0x30], %f30
ldd [%o1 + 0x38], %f32
ldd [%o1 + 0x40], %f34
ldd [%o1 + 0x48], %f36
ldd [%o1 + 0x50], %f38
ldd [%o1 + 0x58], %f40
ldd [%o1 + 0x60], %f42
ldd [%o1 + 0x68], %f44
ldd [%o1 + 0x70], %f46
ldd [%o1 + 0x78], %f48
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x80], %f50
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
faligndata %f26, %f28, %f24
faligndata %f28, %f30, %f26
faligndata %f30, %f32, %f28
faligndata %f32, %f34, %f30
faligndata %f34, %f36, %f32
faligndata %f36, %f38, %f34
faligndata %f38, %f40, %f36
faligndata %f40, %f42, %f38
faligndata %f42, %f44, %f40
faligndata %f44, %f46, %f42
faligndata %f46, %f48, %f44
faligndata %f48, %f50, %f46
.word 0x81b02860 ! SHA512
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f50, %f50, %f18 ! %f18=%f50
ba .Lhwfinish
nop
___
$code.=<<___ if ($SZ==4); # SHA256
ld [%o0 + 0x00], %f0
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
ld [%o0 + 0x0c], %f3
ld [%o0 + 0x10], %f4
ld [%o0 + 0x14], %f5
andcc %o1, 0x7, %g0
ld [%o0 + 0x18], %f6
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x1c], %f7
.Lhwloop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02840 ! SHA256
bne,pt SIZE_T_CC, .Lhwloop
nop
.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
st %f4, [%o0 + 0x10]
st %f5, [%o0 + 0x14]
st %f6, [%o0 + 0x18]
retl
st %f7, [%o0 + 0x1c]
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02840 ! SHA256
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
___
$code.=<<___;
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
sll $tmp31,3,$tmp31
add $inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
mov 32,$tmp32
sub $tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic: call .+8
add %o7,K${label}-.Lpic,$Ktbl
$LD [$ctx+`0*$SZ`],$A
$LD [$ctx+`1*$SZ`],$B
$LD [$ctx+`2*$SZ`],$C
$LD [$ctx+`3*$SZ`],$D
$LD [$ctx+`4*$SZ`],$E
$LD [$ctx+`5*$SZ`],$F
$LD [$ctx+`6*$SZ`],$G
$LD [$ctx+`7*$SZ`],$H
.Lloop:
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
and $tmp2,0xfff,$tmp2
cmp $tmp2,$lastK
bne .L16_xx
add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
___
$code.=<<___ if ($SZ==4); # SHA256
$LD [$ctx+`0*$SZ`],@X[0]
$LD [$ctx+`1*$SZ`],@X[1]
$LD [$ctx+`2*$SZ`],@X[2]
$LD [$ctx+`3*$SZ`],@X[3]
$LD [$ctx+`4*$SZ`],@X[4]
$LD [$ctx+`5*$SZ`],@X[5]
$LD [$ctx+`6*$SZ`],@X[6]
$LD [$ctx+`7*$SZ`],@X[7]
add $A,@X[0],$A
$ST $A,[$ctx+`0*$SZ`]
add $B,@X[1],$B
$ST $B,[$ctx+`1*$SZ`]
add $C,@X[2],$C
$ST $C,[$ctx+`2*$SZ`]
add $D,@X[3],$D
$ST $D,[$ctx+`3*$SZ`]
add $E,@X[4],$E
$ST $E,[$ctx+`4*$SZ`]
add $F,@X[5],$F
$ST $F,[$ctx+`5*$SZ`]
add $G,@X[6],$G
$ST $G,[$ctx+`6*$SZ`]
add $H,@X[7],$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
ld [$ctx+`0*$SZ+0`],%l0
ld [$ctx+`0*$SZ+4`],%l1
ld [$ctx+`1*$SZ+0`],%l2
ld [$ctx+`1*$SZ+4`],%l3
ld [$ctx+`2*$SZ+0`],%l4
ld [$ctx+`2*$SZ+4`],%l5
ld [$ctx+`3*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`3*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$A,$A
add $tmp1,$B,$B
$ST $A,[$ctx+`0*$SZ`]
sllx %l4,32,$tmp2
$ST $B,[$ctx+`1*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$C,$C
$ST $C,[$ctx+`2*$SZ`]
add $T1,$D,$D
$ST $D,[$ctx+`3*$SZ`]
ld [$ctx+`4*$SZ+0`],%l0
ld [$ctx+`4*$SZ+4`],%l1
ld [$ctx+`5*$SZ+0`],%l2
ld [$ctx+`5*$SZ+4`],%l3
ld [$ctx+`6*$SZ+0`],%l4
ld [$ctx+`6*$SZ+4`],%l5
ld [$ctx+`7*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`7*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$E,$E
add $tmp1,$F,$F
$ST $E,[$ctx+`4*$SZ`]
sllx %l4,32,$tmp2
$ST $F,[$ctx+`5*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$G,$G
$ST $G,[$ctx+`6*$SZ`]
add $T1,$H,$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
bne SIZE_T_CC,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
ret
restore
.type sha${label}_block_data_order,#function
.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref,$opf;
my %visopf = ( "faligndata" => 0x048,
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge;
s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,424 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is degree of computational resources' utilization. POWER8 is
# "massively multi-threaded chip" and difference between single- and
# maximum multi-process benchmark results tells that utlization is
# whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
# to single-process one, given that all threads end up on the same
# physical core.
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$LENDIAN=($flavour=~/le/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
if ($output =~ /512/) {
$bits=512;
$SZ=8;
$sz="d";
$rounds=80;
} else {
$bits=256;
$SZ=4;
$sz="w";
$rounds=64;
}
$func="sha${bits}_block_p8";
$FRAME=8*$SIZE_T;
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$Tbl="r6";
$idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
$x00=0 if ($flavour =~ /osx/);
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..23));
($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
addi $inp,$inp,16
___
$code.=<<___ if ($i<16 && ($i%(16/$SZ)));
vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___;
`"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
vsel $Func,$g,$f,$e ; Ch(e,f,g)
vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
`"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
vxor $Func,$a,$b
`"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $d,$d,$h ; d+=h
vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
`"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
lvx $Ki,$idx,$Tbl ; load next K[i]
addi $idx,$idx,16
vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
`"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
___
}
$code=<<___;
.machine "any"
.text
.globl $func
.align 6
$func:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
mflr $lrsave
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r11,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
$PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
addi $offload,$sp,$FRAME+15
___
$code.=<<___ if ($LENDIAN);
li $idx,8
lvsl $lemask,0,$idx
vspltisb $Ki,0x0f
vxor $lemask,$lemask,$Ki
___
$code.=<<___ if ($SZ==4);
lvx_4w $A,$x00,$ctx
lvx_4w $E,$x10,$ctx
vsldoi $B,$A,$A,4 # unpack
vsldoi $C,$A,$A,8
vsldoi $D,$A,$A,12
vsldoi $F,$E,$E,4
vsldoi $G,$E,$E,8
vsldoi $H,$E,$E,12
___
$code.=<<___ if ($SZ==8);
lvx_u $A,$x00,$ctx
lvx_u $C,$x10,$ctx
lvx_u $E,$x20,$ctx
vsldoi $B,$A,$A,8 # unpack
lvx_u $G,$x30,$ctx
vsldoi $D,$C,$C,8
vsldoi $F,$E,$E,8
vsldoi $H,$G,$G,8
___
$code.=<<___;
li r0,`($rounds-16)/16` # inner loop counter
b Loop
.align 5
Loop:
lvx $Ki,$x00,$Tbl
li $idx,16
lvx_u @X[0],0,$inp
addi $inp,$inp,16
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
stvx $D,$x30,$offload
stvx $E,$x40,$offload
stvx $F,$x50,$offload
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
lvx $Ki,$idx,$Tbl
addi $idx,$idx,16
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mtctr r0
b L16_xx
.align 5
L16_xx:
___
for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bdnz L16_xx
lvx @X[2],$x00,$offload
subic. $num,$num,1
lvx @X[3],$x10,$offload
vaddu${sz}m $A,$A,@X[2]
lvx @X[4],$x20,$offload
vaddu${sz}m $B,$B,@X[3]
lvx @X[5],$x30,$offload
vaddu${sz}m $C,$C,@X[4]
lvx @X[6],$x40,$offload
vaddu${sz}m $D,$D,@X[5]
lvx @X[7],$x50,$offload
vaddu${sz}m $E,$E,@X[6]
lvx @X[8],$x60,$offload
vaddu${sz}m $F,$F,@X[7]
lvx @X[9],$x70,$offload
vaddu${sz}m $G,$G,@X[8]
vaddu${sz}m $H,$H,@X[9]
bne Loop
___
$code.=<<___ if ($SZ==4);
lvx @X[0],$idx,$Tbl
addi $idx,$idx,16
vperm $A,$A,$B,$Ki # pack the answer
lvx @X[1],$idx,$Tbl
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
vperm $A,$A,$D,@X[1]
vperm $E,$E,$H,@X[1]
stvx_4w $A,$x00,$ctx
stvx_4w $E,$x10,$ctx
___
$code.=<<___ if ($SZ==8);
vperm $A,$A,$B,$Ki # pack the answer
vperm $C,$C,$D,$Ki
vperm $E,$E,$F,$Ki
vperm $G,$G,$H,$Ki
stvx_u $A,$x00,$ctx
stvx_u $C,$x10,$ctx
stvx_u $E,$x20,$ctx
stvx_u $G,$x30,$ctx
___
$code.=<<___;
li r10,`$FRAME+8*16+15`
mtlr $lrsave
li r11,`$FRAME+8*16+31`
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0
.long 0
.size $func,.-$func
___
# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
if ($SZ==8) {
local *table = sub {
foreach(@_) { $code.=".quad $_,$_\n"; }
};
table(
"0x428a2f98d728ae22","0x7137449123ef65cd",
"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
"0x3956c25bf348b538","0x59f111f1b605d019",
"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
"0xd807aa98a3030242","0x12835b0145706fbe",
"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
"0x9bdc06a725c71235","0xc19bf174cf692694",
"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
"0x983e5152ee66dfab","0xa831c66d2db43210",
"0xb00327c898fb213f","0xbf597fc7beef0ee4",
"0xc6e00bf33da88fc2","0xd5a79147930aa725",
"0x06ca6351e003826f","0x142929670a0e6e70",
"0x27b70a8546d22ffc","0x2e1b21385c26c926",
"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
"0x650a73548baf63de","0x766a0abb3c77b2a8",
"0x81c2c92e47edaee6","0x92722c851482353b",
"0xa2bfe8a14cf10364","0xa81a664bbc423001",
"0xc24b8b70d0f89791","0xc76c51a30654be30",
"0xd192e819d6ef5218","0xd69906245565a910",
"0xf40e35855771202a","0x106aa07032bbd1b8",
"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
"0x748f82ee5defb2fc","0x78a5636f43172f60",
"0x84c87814a1f0ab72","0x8cc702081a6439ec",
"0x90befffa23631e28","0xa4506cebde82bde9",
"0xbef9a3f7b2c67915","0xc67178f2e372532b",
"0xca273eceea26619c","0xd186b8c721c0c207",
"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
"0x06f067aa72176fba","0x0a637dc5a2c898a6",
"0x113f9804bef90dae","0x1b710b35131c471b",
"0x28db77f523047d84","0x32caab7b40c72493",
"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___ if (!$LENDIAN);
.quad 0x0001020304050607,0x1011121314151617
___
$code.=<<___ if ($LENDIAN); # quad-swapped
.quad 0x1011121314151617,0x0001020304050607
___
} else {
local *table = sub {
foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
};
table(
"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___ if (!$LENDIAN);
.long 0x00010203,0x10111213,0x10111213,0x10111213
.long 0x00010203,0x04050607,0x10111213,0x10111213
.long 0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___ if ($LENDIAN); # word-swapped
.long 0x10111213,0x10111213,0x10111213,0x00010203
.long 0x10111213,0x10111213,0x04050607,0x00010203
.long 0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
$code.=<<___;
.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;