Initial Commit

root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

@@ -0,0 +1,919 @@
#!/usr/bin/env perl
# ====================================================================
# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. This means that round calculations should take 4*rounds
# cycles, because any given round instruction depends on the result
# of *both* preceding instructions:
#
# |0 |1 |2 |3 |4
# |01|01|01|
# |23|23|23|
# |01|01|...
# |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, the
# critical path length for CBC encrypt would be 3+4*rounds; in other
# words, it should process one byte in at least (3+4*rounds)/16
# cycles. This estimate doesn't account for "collateral" instructions,
# such as fetching input from memory, xor-ing it with the zero-round
# key and storing the result. Yet, *measured* performance [for data
# aligned at a 64-bit boundary!] deviates from this equation by less
# than 0.5%:
#
# 128-bit key 192- 256-
# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
# (*) numbers after slash are for
# misaligned data.
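#
# A back-of-envelope check of the (3+4*rounds)/16 estimate against
# the aligned numbers above (a sketch, not part of the module):
#
# perl -e 'for (10,12,14) { printf "%.2f ", (3+4*$_)/16 }'
#
# prints 2.69 3.19 3.69, which matches 2.70/3.20/3.70 to within the
# quoted 0.5%.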
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on the critical path. Amazing!
#
# As with Intel AES-NI, the question is whether it's possible to
# improve performance of parallelizable modes by interleaving round
# instructions. Given the round instruction latency and throughput,
# the optimal interleave factor is 2. But can we expect a 2x
# performance improvement? Well, as round instructions can be issued
# one per cycle, they don't saturate the 2-way issue pipeline, so
# there is room for "collateral" calculations... Yet, a 2x speed-up
# over CBC encrypt remains unattainable:
#
# 128-bit key 192- 256-
# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
# (*) numbers after slash are for
# misaligned data.
#
# Estimates based on instruction counts, under the assumption that
# round instructions are not pairable with any other instruction,
# suggest that the latter is the actual case and the pipeline runs
# underutilized. It should be noted that the T4 out-of-order
# execution logic is so capable that the performance gain from 2x
# interleave is not even impressive, ~7-13% over non-interleaved
# code, largest for 256-bit keys.
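# Comparing the two tables, the actual gain over CBC encrypt with a
# 128-bit key comes to 2.70/1.64 ~= 1.65x, visibly short of 2x.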
# For reference, a software implementation processes one byte in 29
# cycles with a 128-bit key on the same processor. Intel Sandy Bridge
# encrypts a byte in 5.07 cycles in CBC mode and decrypts in 0.93,
# naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
&asm_init(@ARGV);
$::evp=1; # if $evp is set to 0, the script generates a module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These, however, are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at a 64-bit boundary. When
# used through EVP, alignment is arranged at the EVP layer. The second
# thing arranged by EVP is at least 32-bit alignment of the IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___ if ($::abibits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.text
.globl aes_t4_encrypt
.align 32
aes_t4_encrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Lenc:
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_eround01 %f16, %f4, %f2, %f0
aes_eround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Lenc
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
aes_eround01_l %f16, %f4, %f2, %f0
aes_eround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_encrypt,#function
.size aes_t4_encrypt,.-aes_t4_encrypt
.globl aes_t4_decrypt
.align 32
aes_t4_decrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Ldec:
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_dround01 %f16, %f4, %f2, %f0
aes_dround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Ldec
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
aes_dround01_l %f16, %f4, %f2, %f0
aes_dround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_decrypt,#function
.size aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
$code.=<<___;
.globl aes_t4_set_encrypt_key
.align 32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
and $inp, 7, $tmp
alignaddr $inp, %g0, $inp
cmp $bits, 192
ldd [$inp + 0], %f0
bl,pt %icc,.L128
ldd [$inp + 8], %f2
be,pt %icc,.L192
ldd [$inp + 16], %f4
brz,pt $tmp, .L256aligned
ldd [$inp + 24], %f6
ldd [$inp + 32], %f8
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
faligndata %f6, %f8, %f6
.L256aligned:
___
for ($i=0; $i<6; $i++) {
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
aes_kexpand0 %f4, %f2, %f4
std %f6, [$out + `32*$i+24`]
aes_kexpand2 %f6, %f4, %f6
___
}
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
std %f6, [$out + `32*$i+24`]
std %f0, [$out + `32*$i+32`]
std %f2, [$out + `32*$i+40`]
mov 14, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L192:
brz,pt $tmp, .L192aligned
nop
ldd [$inp + 24], %f6
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
aes_kexpand2 %f4, %f2, %f4
___
}
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
std %f0, [$out + `24*$i+24`]
std %f2, [$out + `24*$i+32`]
mov 12, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L128:
brz,pt $tmp, .L128aligned
nop
ldd [$inp + 16], %f4
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
std %f0, [$out + `16*$i+0`]
aes_kexpand1 %f0, %f2, $i, %f0
std %f2, [$out + `16*$i+8`]
aes_kexpand2 %f2, %f0, %f2
___
}
$code.=<<___;
std %f0, [$out + `16*$i+0`]
std %f2, [$out + `16*$i+8`]
mov 10, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.type aes_t4_set_encrypt_key,#function
.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl aes_t4_set_decrypt_key
.align 32
aes_t4_set_decrypt_key:
mov %o7, %o5
call .Lset_encrypt_key
nop
mov %o5, %o7
sll $tmp, 4, $inp ! $tmp is number of rounds
add $tmp, 2, $tmp
add $out, $inp, $inp ! $inp=$out+16*rounds
srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
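! The T4 decryption schedule is the encryption schedule with the
! 16-byte round keys in reverse order; the loop below swaps 32-byte
! chunks from both ends toward the middle, (rounds+2)/4 iterations
! in total, with the middle key staying in place.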
.Lkey_flip:
ldd [$out + 0], %f0
ldd [$out + 8], %f2
ldd [$out + 16], %f4
ldd [$out + 24], %f6
ldd [$inp + 0], %f8
ldd [$inp + 8], %f10
ldd [$inp - 16], %f12
ldd [$inp - 8], %f14
sub $tmp, 1, $tmp
std %f0, [$inp + 0]
std %f2, [$inp + 8]
std %f4, [$inp - 16]
std %f6, [$inp - 8]
std %f8, [$out + 0]
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
brnz $tmp, .Lkey_flip
sub $inp, 32, $inp
retl
xor %o0, %o0, %o0
.type aes_t4_set_decrypt_key,#function
.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f4
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01_l %f52, %f4, %f2, %f0
retl
aes_eround23_l %f54, %f4, %f2, %f2
.type _aes128_encrypt_1x,#function
.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
.align 32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f8
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01 %f48, %f4, %f6, %f10
aes_eround23 %f50, %f4, %f6, %f6
aes_eround01_l %f52, %f8, %f2, %f0
aes_eround23_l %f54, %f8, %f2, %f2
aes_eround01_l %f52, %f10, %f6, %f4
retl
aes_eround23_l %f54, %f10, %f6, %f6
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f4
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01_l %f52, %f4, %f2, %f0
retl
aes_dround23_l %f54, %f4, %f2, %f2
.type _aes128_decrypt_1x,#function
.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
.align 32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f8
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01 %f48, %f4, %f6, %f10
aes_dround23 %f50, %f4, %f6, %f6
aes_dround01_l %f52, %f8, %f2, %f0
aes_dround23_l %f54, %f8, %f2, %f2
aes_dround01_l %f52, %f10, %f6, %f4
retl
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align 32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f4
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01_l %f60, %f4, %f2, %f0
retl
aes_eround23_l %f62, %f4, %f2, %f2
.type _aes192_encrypt_1x,#function
.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
.align 32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f8
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01 %f56, %f4, %f6, %f10
aes_eround23 %f58, %f4, %f6, %f6
aes_eround01_l %f60, %f8, %f2, %f0
aes_eround23_l %f62, %f8, %f2, %f2
aes_eround01_l %f60, %f10, %f6, %f4
retl
aes_eround23_l %f62, %f10, %f6, %f6
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f4, %f2, %f0
aes_eround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f4, %f2, %f0
aes_eround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_1x,#function
.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
.align 32
_aes256_encrypt_2x:
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f8, %f2, %f0
aes_eround23 %f22, %f8, %f2, %f2
aes_eround01 %f20, %f10, %f6, %f4
aes_eround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f8, %f2, %f0
aes_eround23_l %f22, %f8, %f2, %f2
aes_eround01_l %f20, %f10, %f6, %f4
aes_eround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f4, %f2, %f0
aes_dround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f4, %f2, %f0
aes_dround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_1x,#function
.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
.align 32
_aes256_decrypt_2x:
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f8, %f2, %f0
aes_dround23 %f22, %f8, %f2, %f2
aes_dround01 %f20, %f10, %f6, %f4
aes_dround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f8, %f2, %f0
aes_dround23_l %f22, %f8, %f2, %f2
aes_dround01_l %f20, %f10, %f6, %f4
aes_dround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global AES_encrypt
AES_encrypt=aes_t4_encrypt
.global AES_decrypt
AES_decrypt=aes_t4_decrypt
.global AES_set_encrypt_key
.align 32
AES_set_encrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_encrypt_key
nop
1: retl
nop
.type AES_set_encrypt_key,#function
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global AES_set_decrypt_key
.align 32
AES_set_decrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_decrypt_key
nop
1: retl
nop
.type AES_set_decrypt_key,#function
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
$code.=<<___;
.globl AES_cbc_encrypt
.align 32
AES_cbc_encrypt:
ld [$key + 240], %g1
nop
brz $enc, .Lcbc_decrypt
cmp %g1, 12
bl,pt %icc, aes128_t4_cbc_encrypt
nop
be,pn %icc, aes192_t4_cbc_encrypt
nop
ba aes256_t4_cbc_encrypt
nop
.Lcbc_decrypt:
bl,pt %icc, aes128_t4_cbc_decrypt
nop
be,pn %icc, aes192_t4_cbc_decrypt
nop
ba aes256_t4_cbc_decrypt
nop
.type AES_cbc_encrypt,#function
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
$code.=<<___;
.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT;

@@ -0,0 +1,989 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it likewise supports both 32- and
# 64-bit modes of operation. The latter is achieved by limiting the
# number of utilized registers to 16, which implies additional NEON
# load and integer instructions. This has no effect on the mighty
# Apple A7, where results are literally equal to the theoretical
# estimates based on AES instruction latencies and issue rates. On
# Cortex-A53, an in-order execution core, this costs up to 10-15%,
# which is partially compensated by implementing a dedicated code
# path for the 128-bit CBC encrypt case. On Cortex-A57, parallelizable
# mode performance seems to be limited by the sheer amount of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A53 1.32 1.29 1.46
# Cortex-A57(*) 1.95 0.85 0.93
# Denver 1.96 0.86 0.80
#
# (*) original 3.64/1.34/1.32 results were for the r0p0 revision
# and are still the same even for the updated module;
$flavour = shift;
open STDOUT,">".shift;
$prefix="aes_v8";
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
#^^^^^^ this is done to simplify adoption by not depending
# on the latest binutils.
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit code within a single module and
# transliterate common code to either flavour with regex voodoo.
#
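# For example (assuming the transliteration rules at the bottom of
# this file behave as written, with $step bound to x8 and $key to x3
# as in the CBC code), the flavour-neutral "cclr $step,eq" comes out
# as "csel x8,xzr,x8,eq" in 64-bit mode and "moveq r8,#0" in 32-bit
# mode, while "vld1.32 {q8},[$key]" becomes "ld1 {v16.4s},[x3]" in
# 64-bit mode.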
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
$code.=<<___;
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
mov $ptr,#-1
cmp $inp,#0
b.eq .Lenc_key_abort
cmp $out,#0
b.eq .Lenc_key_abort
mov $ptr,#-2
cmp $bits,#128
b.lt .Lenc_key_abort
cmp $bits,#256
b.gt .Lenc_key_abort
tst $bits,#0x3f
b.ne .Lenc_key_abort
adr $ptr,rcon
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
.align 4
.L192:
vld1.8 {$in1},[$inp],#8
vmov.i8 $key,#8 // borrow $key
vst1.32 {$in0},[$out],#16
vsub.i8 $mask,$mask,$key // adjust the mask
.Loop192:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#8
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vdup.32 $tmp,${in0}[3]
veor $tmp,$tmp,$in1
veor $key,$key,$rcon
vext.8 $in1,$zero,$in1,#12
vshl.u8 $rcon,$rcon,#1
veor $in1,$in1,$tmp
veor $in0,$in0,$key
veor $in1,$in1,$key
vst1.32 {$in0},[$out],#16
b.ne .Loop192
mov $rounds,#12
add $out,$out,#0x20
b .Ldone
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
mov $ptr,#0
.Lenc_key_abort:
mov x0,$ptr // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
stmdb sp!,{r4,lr}
___
$code.=<<___;
bl .Lenc_key
cmp x0,#0
b.ne .Ldec_key_abort
sub $out,$out,#240 // restore original $out
mov x4,#-16
add $inp,$out,x12,lsl#4 // end of key schedule
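// Build the decryption schedule in place: reverse the order of the
// round keys, applying aesimc to all of them except the first and
// last, walking $out forward and $inp backward until they meet.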
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
.Loop_imc:
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
cmp $inp,$out
b.hi .Loop_imc
vld1.32 {v0.16b},[$out]
aesimc v0.16b,v0.16b
vst1.32 {v0.16b},[$inp]
eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldp x29,x30,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
ldr $rounds,[$key,#240]
vld1.32 {$rndkey0},[$key],#16
vld1.8 {$inout},[$inp]
sub $rounds,$rounds,#2
vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key],#16
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
aes$mc $inout,$inout
vld1.32 {$rndkey1},[$key],#16
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
aes$mc $inout,$inout
vld1.32 {$rndkey0},[$key]
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
vst1.8 {$inout},[$out]
ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
___
$code.=<<___;
subs $len,$len,#16
mov $step,#16
b.lo .Lcbc_abort
cclr $step,eq
cmp $enc,#0 // en- or decrypting?
ldr $rounds,[$key,#240]
and $len,$len,#-16
vld1.8 {$ivec},[$ivp]
vld1.8 {$dat},[$inp],$step
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#6
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
sub $rounds,$rounds,#2
vld1.32 {q10-q11},[$key_],#32
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
b.eq .Lcbc_dec
cmp $rounds,#2
veor $dat,$dat,$ivec
veor $rndzero_n_last,q8,$rndlast
b.eq .Lcbc_enc128
vld1.32 {$in0-$in1},[$key_]
add $key_,$key,#16
add $key4,$key,#16*4
add $key5,$key,#16*5
aese $dat,q8
aesmc $dat,$dat
add $key6,$key,#16*6
add $key7,$key,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
aese $dat,q8
aesmc $dat,$dat
vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
aese $dat,q9
aesmc $dat,$dat
aese $dat,$in0
aesmc $dat,$dat
vld1.32 {q8},[$key4]
cmp $rounds,#4
aese $dat,$in1
aesmc $dat,$dat
vld1.32 {q9},[$key5]
b.eq .Lcbc_enc192
aese $dat,q8
aesmc $dat,$dat
vld1.32 {q8},[$key6]
aese $dat,q9
aesmc $dat,$dat
vld1.32 {q9},[$key7]
nop
.Lcbc_enc192:
aese $dat,q8
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,q9
aesmc $dat,$dat
cclr $step,eq
aese $dat,q10
aesmc $dat,$dat
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q13
aesmc $dat,$dat
vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
aese $dat,q14
aesmc $dat,$dat
aese $dat,q15
veor $ivec,$dat,$rndlast
b.hs .Loop_cbc_enc
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
.align 5
.Lcbc_enc128:
vld1.32 {$in0-$in1},[$key_]
aese $dat,q8
aesmc $dat,$dat
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese $dat,q8
aesmc $dat,$dat
vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
aese $dat,q9
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,$in0
aesmc $dat,$dat
cclr $step,eq
aese $dat,$in1
aesmc $dat,$dat
aese $dat,q10
aesmc $dat,$dat
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
aese $dat,q13
aesmc $dat,$dat
aese $dat,q14
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q15
veor $ivec,$dat,$rndlast
b.hs .Loop_cbc_enc128
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align 5
.Lcbc_dec:
vld1.8 {$dat2},[$inp],#16
subs $len,$len,#32 // bias
add $cnt,$rounds,#2
vorr $in1,$dat,$dat
vorr $dat1,$dat,$dat
vorr $in2,$dat2,$dat2
b.lo .Lcbc_dec_tail
vorr $dat1,$dat2,$dat2
vld1.8 {$dat2},[$inp],#16
vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
vorr $in2,$dat2,$dat2
.Loop3x_cbc_dec:
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_cbc_dec
aesd $dat0,q8
aesimc $dat0,$dat0
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
veor $tmp0,$ivec,$rndlast
subs $len,$len,#0x30
veor $tmp1,$in0,$rndlast
mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q9
aesimc $dat0,$dat0
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
veor $tmp2,$in1,$rndlast
add $inp,$inp,x6 // $inp is adjusted in such a way that
// at exit from the loop $dat1-$dat2
// are loaded with the last "words"
vorr $ivec,$in2,$in2
mov $key_,$key
aesd $dat0,q12
aesimc $dat0,$dat0
aesd $dat1,q12
aesimc $dat1,$dat1
aesd $dat2,q12
aesimc $dat2,$dat2
vld1.8 {$in0},[$inp],#16
aesd $dat0,q13
aesimc $dat0,$dat0
aesd $dat1,q13
aesimc $dat1,$dat1
aesd $dat2,q13
aesimc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
aesimc $dat0,$dat0
aesd $dat1,q14
aesimc $dat1,$dat1
aesd $dat2,q14
aesimc $dat2,$dat2
vld1.8 {$in2},[$inp],#16
aesd $dat0,q15
aesd $dat1,q15
aesd $dat2,q15
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
add $cnt,$rounds,#2
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
veor $dat2,$dat2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$tmp0},[$out],#16
vorr $dat0,$in0,$in0
vst1.8 {$tmp1},[$out],#16
vorr $dat1,$in1,$in1
vst1.8 {$dat2},[$out],#16
vorr $dat2,$in2,$in2
b.hs .Loop3x_cbc_dec
cmn $len,#0x30
b.eq .Lcbc_done
nop
.Lcbc_dec_tail:
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Lcbc_dec_tail
aesd $dat1,q8
aesimc $dat1,$dat1
aesd $dat2,q8
aesimc $dat2,$dat2
aesd $dat1,q9
aesimc $dat1,$dat1
aesd $dat2,q9
aesimc $dat2,$dat2
aesd $dat1,q12
aesimc $dat1,$dat1
aesd $dat2,q12
aesimc $dat2,$dat2
cmn $len,#0x20
aesd $dat1,q13
aesimc $dat1,$dat1
aesd $dat2,q13
aesimc $dat2,$dat2
veor $tmp1,$ivec,$rndlast
aesd $dat1,q14
aesimc $dat1,$dat1
aesd $dat2,q14
aesimc $dat2,$dat2
veor $tmp2,$in1,$rndlast
aesd $dat1,q15
aesd $dat2,q15
b.eq .Lcbc_dec_one
veor $tmp1,$tmp1,$dat1
veor $tmp2,$tmp2,$dat2
vorr $ivec,$in2,$in2
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
b .Lcbc_done
.Lcbc_dec_one:
veor $tmp1,$tmp1,$dat2
vorr $ivec,$in2,$in2
vst1.8 {$tmp1},[$out],#16
.Lcbc_done:
vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#4
mov $step,#16
cmp $len,#2
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
sub $rounds,$rounds,#2
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
vorr $dat1,$dat0,$dat0
add $tctr1, $ctr, #1
vorr $dat2,$dat0,$dat0
add $ctr, $ctr, #2
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
vmov.32 ${dat1}[3],$tctr1
b.ls .Lctr32_tail
rev $tctr2, $ctr
sub $len,$len,#3 // bias
vmov.32 ${dat2}[3],$tctr2
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ctr32
aese $dat0,q8
aesmc $tmp0,$dat0
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
vorr $dat0,$ivec,$ivec
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
vorr $dat1,$ivec,$ivec
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
aesmc $tmp1,$tmp1
vld1.8 {$in2},[$inp],#16
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
vorr $dat2,$ivec,$ivec
add $tctr0,$ctr,#1
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
aesmc $tmp1,$tmp1
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
aese $tmp2,q12
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
aesmc $tmp0,$tmp0
aese $tmp1,q13
aesmc $tmp1,$tmp1
veor $in2,$in2,$rndlast
rev $tctr0,$tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
vmov.32 ${dat0}[3], $tctr0
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
aese $tmp1,q14
aesmc $tmp1,$tmp1
vmov.32 ${dat1}[3], $tctr1
rev $tctr2,$ctr
aese $tmp2,q14
aesmc $tmp2,$tmp2
vmov.32 ${dat2}[3], $tctr2
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15
aese $tmp2,q15
veor $in0,$in0,$tmp0
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
vst1.8 {$in0},[$out],#16
veor $in1,$in1,$tmp1
mov $cnt,$rounds
vst1.8 {$in1},[$out],#16
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
adds $len,$len,#3
b.eq .Lctr32_done
cmp $len,#1
mov $step,#16
cclr $step,eq
.Lctr32_tail:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.32 {q9},[$key_],#16
b.gt .Lctr32_tail
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
vld1.8 {$in1},[$inp]
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
veor $in0,$in0,$rndlast
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
aese $dat1,q15
cmp $len,#1
veor $in0,$in0,$dat0
veor $in1,$in1,$dat1
vst1.8 {$in0},[$out],#16
b.eq .Lctr32_done
vst1.8 {$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
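# As a worked example of the encoding above (the unaes fallback is
# left commented out in the substitution chain below):
# "aese v0.16b,v16.16b" would emit ".inst 0x4e284a00", i.e.
# 0x4e284800|0|(16<<5).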
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
# .byte works since ARMv7 instructions are always encoded
# little-endian. The correct solution is to use the .inst
# directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
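# As a worked example of the encoding above: "aese q0,q8" gives
# $word = 0xf3b00300|((8&8)<<2) = 0xf3b00320, emitted little-endian
# as ".byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8".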
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT;

@@ -0,0 +1,903 @@
#!/usr/bin/env perl
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share the key
# schedule structure with the original, nor does it make assumptions
# about its alignment...
#
# Performance summary. The aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with a 128-bit key, and the vpaes-x86.pl column
# [also large-block CBC] encrypt/decrypt results.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/75.4(***)
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in this context refers to cache shared among
# multiple cores rather than specifically to Intel HTT. As the vast
# majority of contemporary cores share cache, the slower code path
# is commonplace. In other words, "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) The less impressive improvement on Core 2 and Atom is due to
# slow pshufb, yet it's a respectable +28%/64% improvement on
# Core 2 and +15% on Atom (as implied, over the
# "hyper-threading-safe" code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200; # decryption key schedule: invskew x*9
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220; # decryption input transform
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
##
## Decryption core
##
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
&lea ($base,&DWP($k_dsbd,$const));
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
&mov ($magic,$round);
&psrld ("xmm1",4);
&movdqu ("xmm5",&QWP(0,$key));
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);
&pxor ("xmm2","xmm5");
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
&jmp (&label("dec_entry"));
&set_label("dec_loop",16);
##
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pshufb ("xmm1","xmm3"); # 0 = sb9t
&pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
&pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbdt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
&pshufb ("xmm4","xmm2"); # 4 = sbbu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbbt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
&pshufb ("xmm4","xmm2"); # 4 = sbeu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbet
&pxor ("xmm0","xmm4"); # 4 = ch
&add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
&pxor ("xmm0","xmm1"); # 0 = ch
&sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&pand ("xmm0","xmm6"); # 0 = k
&psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm0",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("dec_loop"));
# middle of last round
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
&movdqa ("xmm2",&QWP(0,$magic));
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_decrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
&je (&label("schedule_192"));
# 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
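## (With $round=10 the loop emits 9 round keys via
## _vpaes_schedule_mangle and one more via schedule_mangle_last;
## together with the zeroth key stored earlier that makes the 11
## round keys AES-128 needs.)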
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
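##
## In classic terms, AES-192 expands 6 words to 52, so each 6-word step
## yields 1.5 round keys. A C sketch (illustrative; assumes <string.h>,
## the sbox[] table from the 128-bit sketch above, and a big-endian
## per-word byte convention):
##
##	static uint32_t subword(uint32_t x)
##	{
##		return (uint32_t)sbox[x>>24]<<24 |
##		       (uint32_t)sbox[(x>>16)&0xff]<<16 |
##		       (uint32_t)sbox[(x>>8)&0xff]<<8 | sbox[x&0xff];
##	}
##
##	static void aes192_expand(const uint32_t key[6], uint32_t w[52])
##	{
##		static const uint32_t rc[8] = { 0x01000000,0x02000000,
##			0x04000000,0x08000000,0x10000000,0x20000000,
##			0x40000000,0x80000000 };
##		int i;
##		memcpy(w, key, 6*sizeof(uint32_t));
##		for (i = 6; i < 52; i++) {
##			uint32_t t = w[i-1];
##			if (i % 6 == 0)
##				t = subword((t<<8)|(t>>24)) ^ rc[i/6-1];
##			w[i] = w[i-6] ^ t;
##		}
##	}
##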
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
&mov ($round,4);
&set_label("loop_schedule_192");
&call ("_vpaes_schedule_round");
&palignr("xmm0","xmm6",8);
&call ("_vpaes_schedule_mangle"); # save key n
&call ("_vpaes_schedule_192_smear");
&call ("_vpaes_schedule_mangle"); # save key n+1
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # save key n+2
&call ("_vpaes_schedule_192_smear");
&jmp (&label("loop_schedule_192"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
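##
## Sketch of the classic equivalent (assumes subword() and sbox[] from
## the sketches above, plus <string.h>): every 8th word gets
## RotWord+SubWord+rcon (the high side), and the word 4 positions later
## gets SubWord only -- no rcon, no rotation (the low side):
##
##	static void aes256_expand(const uint32_t key[8], uint32_t w[60])
##	{
##		static const uint32_t rc[7] = { 0x01000000,0x02000000,
##			0x04000000,0x08000000,0x10000000,0x20000000,
##			0x40000000 };
##		int i;
##		memcpy(w, key, 8*sizeof(uint32_t));
##		for (i = 8; i < 60; i++) {
##			uint32_t t = w[i-1];
##			if (i % 8 == 0)
##				t = subword((t<<8)|(t>>24)) ^ rc[i/8-1]; /* high */
##			else if (i % 8 == 4)
##				t = subword(t);				 /* low */
##			w[i] = w[i-8] ^ t;
##		}
##	}
##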
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 0x63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## (%xmm1 is zeroed and clobbered here; the 64-bit version keeps
## this zero in %xmm13)
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
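##
## Dword-level C sketch of this routine (illustrative only; lane 0 is
## the least significant dword, so %xmm7 = {y,x,a,b} and
## %xmm6 = {0,0,c,d} in the notation above):
##
##	static void smear192(const uint32_t x7[4], uint32_t x6[4],
##			     uint32_t x0[4])
##	{
##		uint32_t a = x7[2], b = x7[3], c = x6[2], d = x6[3];
##		x0[0] = a; x0[1] = b; x0[2] = b^c; x0[3] = b^c^d;
##		x6[0] = 0; x6[1] = 0; x6[2] = b^c; x6[3] = b^c^d;
##	}
##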
&function_begin_B("_vpaes_schedule_192_smear");
&pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
&pxor ("xmm6","xmm1"); # -> c+d c 0 0
&pxor ("xmm1","xmm1");
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
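##
## The two pslldq/pxor pairs below implement the "smear" as a prefix-XOR
## over the four dwords of %xmm7. A C sketch (w[0] = least significant
## dword):
##
##	static void smear(uint32_t w[4])
##	{
##		w[3] ^= w[2]; w[2] ^= w[1]; w[1] ^= w[0];  /* x ^= x<<32 */
##		w[3] ^= w[1]; w[2] ^= w[0];		   /* x ^= x<<64 */
##		/* now w[i] = XOR of the original w[0..i] */
##	}
##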
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
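##
## The transform is the usual vpaes nibble split: per byte x, the result
## is lo_tbl[x & 0xF] ^ hi_tbl[x >> 4], with the two 16-byte tables
## living at (%ebx) and 16(%ebx). In C (illustrative):
##
##	static uint8_t transform_byte(uint8_t x, const uint8_t lo_tbl[16],
##				      const uint8_t hi_tbl[16])
##	{
##		return lo_tbl[x & 0x0f] ^ hi_tbl[x >> 4];
##	}
##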
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
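##
## "Multiply by circulant 0,1,1,1" amounts to XOR-ing three byte
## rotations of each column, which the pshufb/pxor chain below performs
## with $k_mc_forward. Per 4-byte column in C (rotation direction
## assumed to match k_mc_forward):
##
##	static void mul_circulant_0111(const uint8_t in[4], uint8_t out[4])
##	{
##		int i;
##		for (i = 0; i < 4; i++)
##			out[i] = in[(i+1)&3] ^ in[(i+2)&3] ^ in[(i+3)&3];
##	}
##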
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
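# The entry points below follow the standard AES_* calling conventions;
# a C-side view (PREFIX assumed to be "vpaes", AES_KEY as in
# <openssl/aes.h>):
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#				   AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#				   AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#			   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#			   const AES_KEY *key);
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#			       size_t length, const AES_KEY *key,
#			       unsigned char *ivec, const int enc);
#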
&function_begin("${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_set_decrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&shl ($base,4);
&lea ($key,&DWP(16,$key,$base));
&mov ($out,1);
&mov ($magic,$round);
&shr ($magic,1);
&and ($magic,32);
	&xor	($magic,32);			# nbits==192?0:32;
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
&function_begin("${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&function_begin("${PREFIX}_decrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_decrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0)); # inp
&mov ($out,&wparam(1)); # out
&mov ($round,&wparam(2)); # len
&mov ($key,&wparam(3)); # key
&sub ($round,16);
&jc (&label("cbc_abort"));
&lea ($base,&DWP(-56,"esp"));
&mov ($const,&wparam(4)); # ivp
&and ($base,-16);
&mov ($magic,&wparam(5)); # enc
&xchg ($base,"esp"); # alloca
&movdqu ("xmm1",&QWP(0,$const)); # load IV
&sub ($out,$inp);
&mov (&DWP(48,"esp"),$base);
&mov (&DWP(0,"esp"),$out); # save out
&mov (&DWP(4,"esp"),$key) # save key
&mov (&DWP(8,"esp"),$const); # save ivp
&mov ($out,$round); # $out works as $len
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&cmp ($magic,0);
&je (&label("cbc_dec_loop"));
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&pxor ("xmm0","xmm1"); # inp^=iv
&call ("_vpaes_encrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&movdqa ("xmm1","xmm0");
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_enc_loop"));
&jmp (&label("cbc_done"));
&set_label("cbc_dec_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
&call ("_vpaes_decrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_dec_loop"));
&set_label("cbc_done");
&mov ($base,&DWP(8,"esp")); # restore ivp
&mov ("esp",&DWP(48,"esp"));
&movdqu (&QWP(0,$base),"xmm1"); # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();
