Initial Commit

This commit is contained in:
root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,460 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.
$cnt="v0"; # $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3"; # $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7"; # $8
#################
$Xi="a0"; # $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10"; # $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT"; # $28
{ my $N;
sub loop() {
$N++;
$code.=<<___;
.align 4
extbl $Xlo,7,$nlo
and $nlo,0xf0,$nhi
sll $nlo,4,$nlo
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Zlo,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Zhi,0($nlo)
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,6(zero)
extbl $Xlo,6,$nlo
ldq $Tlo1,8($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $Thi1,0($nhi)
srl $Zlo,4,$Zlo
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
and $nlo,0xf0,$nhi
xor $Tlo1,$Zlo,$Zlo
sll $nlo,4,$nlo
xor $Thi1,$Zhi,$Zhi
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Tlo0,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Thi0,0($nlo)
.Looplo$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xlo,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Looplo$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,7(zero)
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
unop
.Loophi$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Loophi$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
xor $Tlo0,$Zlo,$Zlo
xor $Thi0,$Zhi,$Zhi
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
xor $t0,$Zlo,$Zlo
xor $rem,$Zhi,$Zhi
___
}}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl gcm_gmult_4bit
.align 4
.ent gcm_gmult_4bit
gcm_gmult_4bit:
.frame sp,0,ra
.prologue 0
ldq $Xlo,8($Xi)
ldq $Xhi,0($Xi)
bsr $t0,picmeup
nop
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
ret (ra)
.end gcm_gmult_4bit
___
$inhi="s0";
$inlo="s1";
$code.=<<___;
.globl gcm_ghash_4bit
.align 4
.ent gcm_ghash_4bit
gcm_ghash_4bit:
lda sp,-32(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
.mask 0x04000600,-32
.frame sp,32,ra
.prologue 0
ldq_u $inhi,0($inp)
ldq_u $Thi0,7($inp)
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
ldq $Xhi,0($Xi)
ldq $Xlo,8($Xi)
bsr $t0,picmeup
nop
.Louter:
extql $inhi,$inp,$inhi
extqh $Thi0,$inp,$Thi0
or $inhi,$Thi0,$inhi
lda $inp,16($inp)
extql $inlo,$inp,$inlo
extqh $Tlo0,$inp,$Tlo0
or $inlo,$Tlo0,$inlo
subq $len,16,$len
xor $Xlo,$inlo,$Xlo
xor $Xhi,$inhi,$Xhi
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
beq $len,.Ldone
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
ldq_u $inhi,0($inp)
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
ldq_u $Thi0,7($inp)
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
br zero,.Louter
.Ldone:
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
.set noreorder
/*ldq ra,0(sp)*/
ldq s0,8(sp)
ldq s1,16(sp)
lda sp,32(sp)
ret (ra)
.end gcm_ghash_4bit
.align 4
.ent picmeup
picmeup:
.frame sp,0,$t0
.prologue 0
br $rem_4bit,.Lpic
.Lpic: lda $rem_4bit,12($rem_4bit)
ret ($t0)
.end picmeup
nop
rem_4bit:
.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

View File

@@ -0,0 +1,498 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
# in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$Zll="r4"; # variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
sub Zsmash() {
my $i=12;
my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) {
$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16
strb $Tlh,[$Xi,#$i+2]
mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i]
#endif
___
$code.="\t".shift(@args)."\n";
$i-=4;
}
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#ifdef __clang__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
sub $rem_4bit,pc,#8
sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
b .Lrem_4bit_got
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
sub r12,pc,#8
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,#15]
ldrb $nhi,[$Xi,#15]
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrb $nhi,[$Xi,#14]
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
mov $nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,#15]
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
and $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1,:64]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1,:64]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
vld1.64 $IN#lo,[$Xi,:64]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
vld1.64 $Xl#lo,[$Xi,:64]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
vst1.64 $Xl#lo,[$Xi,:64]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

View File

@@ -0,0 +1,463 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.
# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.
$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
(p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
(p18) xor Zlo=Zlo,Hlo
(p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
(p18) add Hi[1]=Htbl,Hi[1] };;
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
(p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
(p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
(p18) xor Zlo=Zlo,Hlo
(p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
(p17) add Hi[0]=Htbl,Hi[0]
br.ctop.sptk $label };;
___
}
$code=<<___;
.explicit
.text
prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;
.align 128
.skip 16 // aligns loop body
.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
gcm_gmult_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,2,6,0,8
$ADDP Xi=15,in0 // &Xi[15]
mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
.save ar.lc,prevlc
mov prevlc=ar.lc
.save pr,prevpr
mov prevpr=pr };;
.body
.rotr in[3],xi[3],Hi[2]
{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
mov mask0xf0=0xf0
brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
};;
{ .mii; shladd Hi[1]=xi[2],4,r0
mov pr.rot=0x7<<16
mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
mov ar.ec=3
xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
xor Zhi=Zhi,Zhi };;
___
&loop (".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
mov pr=prevpr,0x1ffff };;
{ .mib; st8 [Hhi]=Zhi
mov ar.lc=prevlc
br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#
___
######################################################################
# "528B" (well, "512B" actualy) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
sub load_htable() {
for (my $i=0;$i<8;$i++) {
$code.=<<___;
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
___
$code.=shift if (($i+$#_)==7);
$code.="\t};;\n"
}
}
$code.=<<___;
prevsp=r3;
.align 32
.skip 16 // aligns loop body
.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
gcm_ghash_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,2,0,0
.vframe prevsp
mov prevsp=sp
mov $rem_8bit=ip };;
.body
{ .mfi; $ADDP r8=0+0,$Htbl
$ADDP r9=0+8,$Htbl }
{ .mfi; $ADDP r10=128+0,$Htbl
$ADDP r11=128+8,$Htbl };;
___
&load_htable(
" $ADDP $Xip=15,$Xip", # &Xi[15]
" $ADDP $len=$len,$inp", # &inp[len]
" $ADDP $inp=15,$inp", # &inp[15]
" mov $mask0xff=0xff",
" add sp=-512,sp",
" andcm sp=sp,$mask0xff", # align stack frame
" add r14=0,sp",
" add r15=8,sp");
$code.=<<___;
{ .mmi; $sum 1<<1 // go big-endian
add r8=256+0,sp
add r9=256+8,sp }
{ .mmi; add r10=256+128+0,sp
add r11=256+128+8,sp
add $len=-17,$len };;
___
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
st8 [r9]=$rhi,16 // Htable[$i].hi
shrp $rlo=$rhi,$rlo,4 }//;;
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
shr.u $rhi=$rhi,4 };;
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
ld8 r17=[r9],16 };; // Htable[8].hi
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
ld8 r19=[r9],16 } // Htable[9].hi
{ .mmi; rum 1<<5 // clear um.mfh
shrp r16=r17,r16,4 };;
___
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
___
}
$code.=<<___;
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
{ .mmi; add $Htbl=256,sp // &Htable[0]
add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
___
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");
$code.=<<___; # (p16)
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
cmp.eq p0,p6=r0,r0 };; // clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17)
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
.align 32
.LOOP:
{ .mmi;
(p6) st8 [$Xip]=$Zhi,13
xor $Zlo=$Zlo,$Zlo
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17),(p18)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 } //(p16) *inp--
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___; # (p16),(p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 //(p16) *inp--
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}
$code.=<<___; # (p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p18),(p19)
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
add $inp=32,$inp
shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;
{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,731 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$NREGS =6;
} else {
$LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$NREGS =11;
}
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
# [+ argument transfer]
################# volatile registers
$Xi="%r26"; # argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl; # variables
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";
################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
if ($SIZE_T==4) {
$Zhl="%r6";
$Zlh="%r7";
$Hhl="%r8";
$Hlh="%r9";
$Thl="%r10";
$Tlh="%r11";
}
$rem2="%r6"; # used in PA-RISC 2.0 code
$code.=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 64
gcm_gmult_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_gmult
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_gmult
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_gmult_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
ldbx $cnt($Xi),$nlo
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $rem($rem_4bit),$rem
xor $Tll,$Zll,$Zll
addib,uv -1,$cnt,L\$oop_gmult_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
std $Zhh,0($Xi)
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_gmult
nop
L\$parisc1_gmult
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_gmult_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_gmult_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
stw $Zhh,0($Xi)
___
$code.=<<___;
L\$done_gmult
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
gcm_ghash_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_ghash
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_ghash
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
L\$outer_ghash_pa2
ldb 15($inp),$nhi
xor $nhi,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_ghash_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldbx $cnt($Xi),$nlo
ldbx $cnt($inp),$byte
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
xor $byte,$nlo,$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
ldd $rem($rem_4bit),$rem
addib,uv -1,$cnt,L\$oop_ghash_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
ldo 16($inp),$inp
std $Zhh,0($Xi)
cmpb,*<> $inp,$len,L\$outer_ghash_pa2
copy $Zll,$nlo
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_ghash
nop
L\$parisc1_ghash
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
L\$outer_ghash_pa1
ldb 15($inp),$byte
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_ghash_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
ldbx $cnt($inp),$byte
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
xor $byte,$nlo,$nlo
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_ghash_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
ldo 16($inp),$inp
stw $Zhh,0($Xi)
comb,<> $inp,$len,L\$outer_ghash_pa1
copy $Zll,$nlo
___
$code.=<<___;
L\$done_ghash
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.ALIGN 64
L\$rem_4bit
.WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
.ALIGN 64
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
my $depd = sub {
my ($mod,$args) = @_;
my $orig = "depd$mod\t$args";
# I only have ",z" completer, it's impicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
my $cpos=63-$2;
my $len=32-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($SIZE_T==4) {
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
s/cmpb,\*/comb,/;
s/,\*/,/;
}
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is worse result than expected: loop is scheduled for 12
# and the result should be close to 12. In the lack of instruction-
# level profiling data it's impossible to tell why...
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.
# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# 8KB buffer ~7 faster than software implementation. It's not as
# impressive for smaller buffer sizes and for smallest 16-bytes buffer
# it's actually almost 2 times slower. Which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$softonly=0;
$Zhi="%r0";
$Zlo="%r1";
$Xi="%r2"; # argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";
$rem0="%r6"; # variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";
$rem_4bit="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
___
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
jz .Lsoft_gmult
lghi %r0,0
la %r1,16($sp)
.long 0xb93e0004 # kimd %r0,%r4
lg %r1,24($sp)
tmhh %r1,0x4000 # check for function 65
jz .Lsoft_gmult
stg %r0,16($sp) # arrange 16 bytes of zero input
stg %r0,24($sp)
lghi %r0,65 # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
la $inp,16($sp)
lghi $len,16
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 32
.Lsoft_gmult:
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
lghi $len,1
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
j .Lgmult_shortcut
.type gcm_gmult_4bit,\@function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
jz .Lsoft_ghash
lghi %r0,0
la %r1,16($sp)
.long 0xb93e0004 # kimd %r0,%r4
lg %r1,24($sp)
tmhh %r1,0x4000 # check for function 65
jz .Lsoft_ghash
lghi %r0,65 # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 32
.Lsoft_ghash:
___
$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
srlg $len,$len,4
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
.Lgmult_shortcut:
lghi $tmp,0xf0
sllg $nlo,$Zlo,4
srlg $xi,$Zlo,8 # extract second byte
ngr $nlo,$tmp
lgr $nhi,$Zlo
lghi $cnt,14
ngr $nhi,$tmp
lg $Zlo,8($nlo,$Htbl)
lg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
sllg $rem0,$Zlo,3
ngr $nlo,$tmp
ngr $rem0,$x78
ngr $xi,$tmp
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
xg $Zhi,0($rem0,$rem_4bit)
nill $nlo,0xf0
sllg $rem0,$Zlo,3
xgr $Zlo,$tmp
ngr $rem0,$x78
nill $xi,0xf0
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
xg $Zhi,0($rem1,$rem_4bit)
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
xg $Zhi,0($nlo,$Htbl)
sllg $xi,$Zlo,3
xg $Zhi,0($rem0,$rem_4bit)
xgr $Zlo,$tmp
ngr $xi,$x78
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
xgr $Zlo,$tmp
xg $Zhi,0($rem1,$rem_4bit)
lg $tmp,0($xi,$rem_4bit)
la $inp,16($inp)
sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
brctg $len,.Louter
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
lm${g} %r6,%r14,6*$SIZE_T($sp)
br %r14
.type gcm_ghash_4bit,\@function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
.align 64
rem_4bit:
.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type rem_4bit,\@object
.size rem_4bit,(.-rem_4bit)
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,571 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
# gcc 3.3.x cc 5.2 this assembler
#
# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
# gcc 4.4.1 this assembler
#
# 32-bit build 566 50 (+1000%)
# 64-bit build 56 50 (+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output=shift;
open STDOUT,">$output";
$Zhi="%o0"; # 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";
$nhi="%l0"; # small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";
$Xi="%i0"; # input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 64
rem_4bit:
.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type rem_4bit,#object
.size rem_4bit,(.-rem_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
save %sp,-$frame,%sp
ldub [$inp+15],$nlo
ldub [$Xi+15],$xi0
ldub [$Xi+14],$xi1
add $len,$inp,$len
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
.Louter:
xor $xi0,$nlo,$nlo
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$inp+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $xi1,$nlo,$nlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lghash_inner
sll $nlo,4,$nlo
.align 32
.Lghash_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
ldub [$Xi+$cnt],$xi1
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $xi1,$nlo,$nlo
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lghash_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
add $inp,16,$inp
cmp $inp,$len
be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+15],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
srl $Zlo,8,$xi1
and $Zlo,0xff,$xi0
ba .Louter
and $xi1,0xff,$xi1
.align 32
.Ldone:
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_ghash_4bit,#function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
undef $inp;
undef $len;
$code.=<<___;
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
save %sp,-$frame,%sp
ldub [$Xi+15],$nlo
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$Xi+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lgmult_inner
sll $nlo,4,$nlo
.align 32
.Lgmult_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$Xi+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lgmult_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
($shl,$shr)=map("%l$_",(0..7));
# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl gcm_init_vis3
.align 32
gcm_init_vis3:
save %sp,-$frame,%sp
ldx [%i1+0],$Hhi
ldx [%i1+8],$Hlo
mov 0xE1,$Xhi
mov 1,$Xlo
sllx $Xhi,57,$Xhi
srax $Hhi,63,$C0 ! broadcast carry
addcc $Hlo,$Hlo,$Hlo ! H<<=1
addxc $Hhi,$Hhi,$Hhi
and $C0,$Xlo,$Xlo
and $C0,$Xhi,$Xhi
xor $Xlo,$Hlo,$Hlo
xor $Xhi,$Hhi,$Hhi
stx $Hlo,[%i0+8] ! save twisted H
stx $Hhi,[%i0+0]
sethi %hi(0xA0406080),$V
sethi %hi(0x20C0E000),%l0
or $V,%lo(0xA0406080),$V
or %l0,%lo(0x20C0E000),%l0
sllx $V,32,$V
or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
stx $V,[%i0+16]
ret
restore
.type gcm_init_vis3,#function
.size gcm_init_vis3,.-gcm_init_vis3
.globl gcm_gmult_vis3
.align 32
gcm_gmult_vis3:
save %sp,-$frame,%sp
ldx [$Xip+8],$Xlo ! load Xi
ldx [$Xip+0],$Xhi
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_gmult_vis3,#function
.size gcm_gmult_vis3,.-gcm_gmult_vis3
.globl gcm_ghash_vis3
.align 32
gcm_ghash_vis3:
save %sp,-$frame,%sp
ldx [$Xip+8],$C2 ! load Xi
ldx [$Xip+0],$C3
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
and $inp,7,$shl
andn $inp,7,$inp
sll $shl,3,$shl
prefetch [$inp+63], 20
sub %g0,$shl,$shr
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
.Loop:
ldx [$inp+8],$Xlo
brz,pt $shl,1f
ldx [$inp+0],$Xhi
ldx [$inp+16],$C1 ! align data
srlx $Xlo,$shr,$C0
sllx $Xlo,$shl,$Xlo
sllx $Xhi,$shl,$Xhi
srlx $C1,$shr,$C1
or $C0,$Xhi,$Xhi
or $C1,$Xlo,$Xlo
1:
add $inp,16,$inp
sub $len,16,$len
xor $C2,$Xlo,$Xlo
xor $C3,$Xhi,$Xhi
prefetch [$inp+63], 20
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
brnz,pt $len,.Loop
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_ghash_vis3,#function
.size gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"xmulx" => 0x115,
"xmulxhi" => 0x116 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for futher improvement.
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my $vrsave="r12";
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
lis r0,0xfff0
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $H,$H,$t1 # twisted H
vsldoi $H,$H,$H,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
stvx_u $H, r9,r3
stvx_u $Hh,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
lvx_u $IN,0,$inp
addi $inp,$inp,16
subi $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
b Loop
.align 5
Loop:
subic $len,$len,16
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
subfe. r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
add $inp,$inp,r0
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,0,$inp
addi $inp,$inp,16
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
beq Loop # did $len-=16 borrow?
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
} else {
s/le\?/#le#/o or
s/be\?//o;
}
print $_,"\n";
}
close STDOUT; # enforce flush

View File

@@ -0,0 +1,409 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# Current performance in cycles per processed byte:
#
# PMULL[2] 32-bit NEON(*)
# Apple A7 0.92 5.62
# Cortex-A53 1.01 8.39
# Cortex-A57 1.17 7.61
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
open STDOUT,">".shift;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#include "arm_arch.h"
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only few of 16 slots of Htable[16] are used;
# data is opaque to outside world (which allows to
# optimize the code independently);
#
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, but divisible by block size;
# output: next hash value Xi;
#
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ alorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude oversteping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remainig legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remainig new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT; # enforce flush