Initial Commit

This commit is contained in:
root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

View File

@@ -0,0 +1,209 @@
#!/usr/local/bin/perl
#
# The inner loop instruction sequence and the IP/FP modifications are from
# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
# I've added the stuff needed for crypt() but I've not worried about making
# things perfect.
#
# Derive the directory this script lives in from $0 and add both it and
# its ../../perlasm sibling to @INC so that x86asm.pl can be found.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# The first command-line argument is handed straight to asm_init
# (presumably the assembler flavour to generate -- confirm in x86asm.pl).
&asm_init($ARGV[0],"crypt586.pl");
# Registers that hold the two 32-bit halves of the block being processed.
$L="edi";
$R="esi";
# DES_SPtrans is referenced by the generated code but is not emitted by
# this script.
&external_label("DES_SPtrans");
# Emit the one routine this script generates and finish the output.
&fcrypt_body("fcrypt_body");
&asm_finish();
# Emit the function $name, the inner loop used for crypt().
#
# Arguments:
# $name - label of the generated function.
# $do_ip - unpacked for symmetry with other generators; never read here.
#
# The generated code clears $L and $R, pushes the DES_SPtrans address and a
# loop counter of 25, runs 16 unrolled D_ENCRYPT rounds per iteration, swaps
# $L/$R and loops until the counter reaches zero, then applies FP_new and
# stores the two result words through the pointer in wparam(0).
sub fcrypt_body
{
local($name,$do_ip)=@_;
&function_begin($name);
&comment("");
&comment("Load the 2 words");
# Note: this assigns the file-level $trans, it is not localised.
$trans="ebp";
&xor( $L, $L);
&xor( $R, $R);
# PIC-ification:-)
&picmeup("edx","DES_SPtrans");
#if ($cpp) { &picmeup("edx","DES_SPtrans"); }
#else { &lea("edx",&DWP("DES_SPtrans")); }
&push("edx"); # becomes &swtmp(1)
#
&mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT
&push(&DWC(25)); # add a variable
&set_label("start");
# Sixteen rounds, emitted two at a time with $L and $R exchanging roles.
for ($i=0; $i<16; $i+=2)
{
&comment("");
&comment("Round $i");
&D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx");
&comment("");
&comment("Round ".sprintf("%d",$i+1));
&D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx");
}
# Decrement the counter held in swtmp(0), swap $L and $R via eax, and
# loop back to "start" while the counter is non-zero.
&mov("ebx", &swtmp(0));
&mov("eax", $L);
&dec("ebx");
&mov($L, $R);
&mov($R, "eax");
&mov(&swtmp(0), "ebx");
&jnz(&label("start"));
&comment("");
&comment("FP");
# wparam(0) is the destination pointer for the two output words.
&mov("edx",&wparam(0));
&FP_new($R,$L,"eax",3);
&mov(&DWP(0,"edx","",0),"eax");
&mov(&DWP(4,"edx","",0),$L);
# Drop the two words pushed above (table address and loop counter).
&add("esp",8); # remove variables
&function_end($name);
}
# Emit one round for fcrypt_body.
#
# Arguments:
# $r - round number; not referenced in the body.
# $L - register that receives the XOR of the eight table lookups.
# $R - register holding the round input.
# $S - index of the first of two consecutive 32-bit words read from
# [$trans + $S*4] and [$trans + ($S+1)*4].
# $trans - base register; points at the data from wparam(1) on entry, is
# switched to swtmp(1) for the table lookups and restored from
# wparam(1) before returning.
# $u, $tmp1, $tmp2, $t - scratch registers; all four are clobbered.
#
# The trailing "# 1" / "# 2" remarks come from the original author and are
# left as found (presumably scheduling notes -- not verifiable from here).
sub D_ENCRYPT
{
local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_;
# Build $u and $t from $R ^ ($R >> 16), masked by wparam(2) and wparam(3)
# respectively, then fold each result into its own upper half.
&mov( $u, &wparam(2)); # 2
&mov( $t, $R);
&shr( $t, 16); # 1
&mov( $tmp2, &wparam(3)); # 2
&xor( $t, $R); # 1
&and( $u, $t); # 2
&and( $t, $tmp2); # 2
&mov( $tmp1, $u);
&shl( $tmp1, 16); # 1
&mov( $tmp2, $t);
&shl( $tmp2, 16); # 1
&xor( $u, $tmp1); # 2
&xor( $t, $tmp2); # 2
# Mix in the two words at index $S / $S+1 and the round input.
&mov( $tmp1, &DWP(&n2a($S*4),$trans,"",0)); # 2
&xor( $u, $tmp1);
&mov( $tmp2, &DWP(&n2a(($S+1)*4),$trans,"",0)); # 2
&xor( $u, $R);
&xor( $t, $R);
&xor( $t, $tmp2);
&and( $u, "0xfcfcfcfc" ); # 2
&xor( $tmp1, $tmp1); # 1
&and( $t, "0xcfcfcfcf" ); # 2
&xor( $tmp2, $tmp2);
&movb( &LB($tmp1), &LB($u) );
&movb( &LB($tmp2), &HB($u) );
&rotr( $t, 4 );
# Switch $trans to the table address that fcrypt_body saved in swtmp(1).
&mov( $trans, &swtmp(1));
# Eight lookups at displacements 0x000..0x700 from $trans, indexed by the
# individual bytes of $u and $t, each XORed into $L.
&xor( $L, &DWP(" ",$trans,$tmp1,0));
&movb( &LB($tmp1), &LB($t) );
&xor( $L, &DWP("0x200",$trans,$tmp2,0));
&movb( &LB($tmp2), &HB($t) );
&shr( $u, 16);
&xor( $L, &DWP("0x100",$trans,$tmp1,0));
&movb( &LB($tmp1), &HB($u) );
&shr( $t, 16);
&xor( $L, &DWP("0x300",$trans,$tmp2,0));
&movb( &LB($tmp2), &HB($t) );
&and( $u, "0xff" );
&and( $t, "0xff" );
&mov( $tmp1, &DWP("0x600",$trans,$tmp1,0));
&xor( $L, $tmp1);
&mov( $tmp1, &DWP("0x700",$trans,$tmp2,0));
&xor( $L, $tmp1);
&mov( $tmp1, &DWP("0x400",$trans,$u,0));
&xor( $L, $tmp1);
&mov( $tmp1, &DWP("0x500",$trans,$t,0));
&xor( $L, $tmp1);
# Point $trans back at wparam(1) for the next round.
&mov( $trans, &wparam(1));
}
# Format a number as a decimal integer string (fractions are truncated by
# the %d conversion). Used to build displacement operands.
sub n2a
{
    my ($number) = @_;
    return sprintf('%d', $number);
}
# Emit one swap step of the IP/FP permutation network.
#
# Arguments:
# $a     - first register; rotated left by $shift (side effect) and left
#          holding the masked difference ($a ^ $b) & $mask.
# $b     - second register; XORed with the masked difference.
# $tt    - receives the rotated $a XORed with the masked difference.
# $shift - left-rotate count applied to $a first; 0 emits no rotate.
# $mask  - immediate AND mask.
# $last  - accepted so existing six-argument call sites keep working, but
#          ignored.
#
# The original code chose between two orderings of the final XOR pair with
# "if ($notlast eq $b)".  $notlast is never assigned anywhere, so that test
# was always false and only the ordering below was ever emitted.  The dead
# arm has been removed; the generated instructions are unchanged.
#
# The emitter calls keep the "&" sigil on purpose: xor, and, push, ... are
# Perl built-ins, so "&xor(...)" is required to reach the user-defined sub.
sub R_PERM_OP
{
    my ($reg_a, $reg_b, $reg_tt, $shift, $mask, $last) = @_;

    &rotl($reg_a, $shift) if ($shift != 0);
    &mov($reg_tt, $reg_a);
    &xor($reg_a, $reg_b);
    &and($reg_a, $mask);
    &xor($reg_tt, $reg_a);
    &xor($reg_b, $reg_a);
    &comment("");
}
# Emit the initial permutation: five R_PERM_OP steps followed by rotate
# fix-ups that depend on $lr.
#
# Arguments:
# $l, $r, $tt - the three registers threaded through the R_PERM_OP steps.
# $lr         - rotation selector: $tt is rotated left by ($lr - 3) and $r
#               left by ($lr - 2); a negative amount becomes a right
#               rotate and zero emits nothing.
sub IP_new
{
    my ($left, $right, $scratch, $lr) = @_;

    # Register roles, rotate count and mask for each step, in emission order.
    foreach my $step (
        [ $left,    $right,   $scratch,  4, "0xf0f0f0f0", $left  ],
        [ $right,   $scratch, $left,    20, "0xfff0000f", $left  ],
        [ $left,    $scratch, $right,   14, "0x33333333", $right ],
        [ $scratch, $right,   $left,    22, "0x03fc03fc", $right ],
        [ $left,    $right,   $scratch,  9, "0xaaaaaaaa", $right ],
    )
    {
        &R_PERM_OP(@$step);
    }

    my $scratch_rot = $lr - 3;
    if ($scratch_rot < 0)
    {
        &rotr($scratch, -$scratch_rot);
    }
    elsif ($scratch_rot > 0)
    {
        &rotl($scratch, $scratch_rot);
    }

    my $right_rot = $lr - 2;
    if ($right_rot < 0)
    {
        &rotr($right, -$right_rot);
    }
    elsif ($right_rot > 0)
    {
        &rotl($right, $right_rot);
    }
}
# Emit the final permutation: rotate fix-ups that depend on $lr, five
# R_PERM_OP steps, and a closing 4-bit right rotate of $tt.
#
# Arguments:
# $l, $r, $tt - the three registers threaded through the R_PERM_OP steps.
# $lr         - rotation selector: $r is rotated right by ($lr - 2) and $l
#               right by ($lr - 3); a negative amount becomes a left
#               rotate and zero emits nothing.
sub FP_new
{
    my ($left, $right, $scratch, $lr) = @_;

    my $right_rot = $lr - 2;
    if ($right_rot < 0)
    {
        &rotl($right, -$right_rot);
    }
    elsif ($right_rot > 0)
    {
        &rotr($right, $right_rot);
    }

    my $left_rot = $lr - 3;
    if ($left_rot < 0)
    {
        &rotl($left, -$left_rot);
    }
    elsif ($left_rot > 0)
    {
        &rotr($left, $left_rot);
    }

    # Register roles, rotate count and mask for each step, in emission order.
    foreach my $step (
        [ $left,    $right,   $scratch,  0, "0xaaaaaaaa", $right ],
        [ $scratch, $right,   $left,    23, "0x03fc03fc", $right ],
        [ $left,    $right,   $scratch, 10, "0x33333333", $left  ],
        [ $right,   $scratch, $left,    18, "0xfff0000f", $left  ],
        [ $left,    $scratch, $right,   12, "0xf0f0f0f0", $right ],
    )
    {
        &R_PERM_OP(@$step);
    }

    &rotr($scratch, 4);
}

View File

@@ -0,0 +1,455 @@
#!/usr/local/bin/perl
#
# The inner loop instruction sequence and the IP/FP modifications are from
# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk>
#
# Derive the directory this script lives in from $0 and add both it and
# its ../../perlasm sibling to @INC so the helper libraries can be found.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
require "cbc.pl";
require "desboth.pl";
# base code is in microsoft
# op dest, source
# format.
#
&asm_init($ARGV[0],"des-586.pl");
# Registers for the two 32-bit block halves and the SP-table base.
$L="edi";
$R="esi";
$trans="ebp";
# Selecting -DOPENSSL_SMALL_FOOTPRINT on the command line switches the
# internal encrypt/decrypt routines from unrolled rounds to a folded loop.
$small_footprint=1 if (grep(/\-DOPENSSL_SMALL_FOOTPRINT/,@ARGV));
# one can discuss setting this variable to 1 unconditionally, as
# the folded loop is only 3% slower than unrolled, but >7 times smaller
&public_label("DES_SPtrans");
&static_label("des_sptrans");
# Emit every routine in output order; the lookup table goes last.
&DES_encrypt_internal();
&DES_decrypt_internal();
&DES_encrypt("DES_encrypt1",1);
&DES_encrypt("DES_encrypt2",0);
&DES_encrypt3("DES_encrypt3",1);
&DES_encrypt3("DES_decrypt3",0);
&cbc("DES_ncbc_encrypt","DES_encrypt1","DES_encrypt1",0,4,5,3,5,-1);
&cbc("DES_ede3_cbc_encrypt","DES_encrypt3","DES_decrypt3",0,6,7,3,4,5);
&DES_SPtrans();
&asm_finish();
# Emit the internal routine _x86_DES_encrypt.
#
# The generated code expects the key-schedule pointer in ecx and keeps a
# copy of it in swtmp(0), which every D_ENCRYPT call uses to reload ecx.
# With $small_footprint unset the 16 rounds are unrolled; otherwise a
# two-round loop body is emitted that advances ecx by 16 bytes per pass
# until it reaches the limit (original ecx + 128) stored in swtmp(1).
#
# The file-level $i is used as the round counter on purpose, exactly as
# the unrolled loop always did.
sub DES_encrypt_internal()
{
    my $entry   = "_x86_DES_encrypt";
    my @scratch = ("eax","ebx","ecx","edx");

    &function_begin_B($entry);
    if (!$small_footprint)
    {
        # Unrolled: save the key-schedule pointer, emit all 16 rounds.
        &push("ecx");
        $i = 0;
        while ($i < 16)
        {
            &comment("Round $i");
            &D_ENCRYPT($i,$L,$R,$i*2,$trans,@scratch,&swtmp(0));
            &comment(sprintf("Round %d",$i+1));
            &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,@scratch,&swtmp(0));
            $i += 2;
        }
        &add("esp",4);
    }
    else
    {
        # Folded: push the end-of-schedule limit and the running pointer.
        &lea("edx",&DWP(128,"ecx"));
        &push("edx");
        &push("ecx");
        &set_label("eloop");
        &D_ENCRYPT(0,$L,$R,0,$trans,@scratch,&swtmp(0));
        &comment("");
        &D_ENCRYPT(1,$R,$L,2,$trans,@scratch,&swtmp(0));
        &comment("");
        &add("ecx",16);
        &cmp("ecx",&swtmp(1));
        &mov(&swtmp(0),"ecx");
        &jb(&label("eloop"));
        &add("esp",8);
    }
    &ret();
    &function_end_B($entry);
}
# Emit the internal routine _x86_DES_decrypt.
#
# Mirror image of _x86_DES_encrypt: the rounds walk the key schedule from
# the top down.  With $small_footprint unset the 16 rounds are unrolled
# (schedule indices 15 down to 0); otherwise a two-round loop body is
# emitted that starts at ecx + 128, reads at negative offsets and steps ecx
# down by 16 bytes per pass until it reaches the original pointer kept in
# swtmp(1).
#
# The file-level $i is used as the round counter on purpose, exactly as
# the unrolled loop always did.
sub DES_decrypt_internal()
{
    my $entry   = "_x86_DES_decrypt";
    my @scratch = ("eax","ebx","ecx","edx");

    &function_begin_B($entry);
    if (!$small_footprint)
    {
        # Unrolled: save the key-schedule pointer, emit all 16 rounds.
        &push("ecx");
        $i = 15;
        while ($i > 0)
        {
            &comment("Round $i");
            &D_ENCRYPT(15-$i,$L,$R,$i*2,$trans,@scratch,&swtmp(0));
            &comment(sprintf("Round %d",$i-1));
            &D_ENCRYPT(15-$i+1,$R,$L,($i-1)*2,$trans,@scratch,&swtmp(0));
            $i -= 2;
        }
        &add("esp",4);
    }
    else
    {
        # Folded: push the lower limit, then the running pointer.
        &push("ecx");
        &lea("ecx",&DWP(128,"ecx"));
        &push("ecx");
        &set_label("dloop");
        &D_ENCRYPT(0,$L,$R,-2,$trans,@scratch,&swtmp(0));
        &comment("");
        &D_ENCRYPT(1,$R,$L,-4,$trans,@scratch,&swtmp(0));
        &comment("");
        &sub("ecx",16);
        &cmp("ecx",&swtmp(1));
        &mov(&swtmp(0),"ecx");
        &ja(&label("dloop"));
        &add("esp",8);
    }
    &ret();
    &function_end_B($entry);
}
# Emit a public single-DES entry point named $name.
#
# Arguments:
# $name - label of the generated function.
# $do_ip - when true the generated code applies IP_new before and FP_new
# after the rounds; when false it only rotates both words left by
# 3 on load and right by 3 on store.
#
# The generated function reads the data pointer from wparam(0), the value
# passed on in ecx to the internal routine from wparam(1), and the
# encrypt/decrypt flag from wparam(2); a flag of 0 selects
# _x86_DES_decrypt, anything else _x86_DES_encrypt.
sub DES_encrypt
{
local($name,$do_ip)=@_;
&function_begin_B($name);
&push("esi");
&push("edi");
&comment("");
&comment("Load the 2 words");
# The remaining callee-saved pushes are interleaved with the loads in both
# arms below; wparam() is evaluated at each call, so the order matters.
if ($do_ip)
{
&mov($R,&wparam(0));
&xor( "ecx", "ecx" );
&push("ebx");
&push("ebp");
&mov("eax",&DWP(0,$R,"",0));
&mov("ebx",&wparam(2)); # get encrypt flag
&mov($L,&DWP(4,$R,"",0));
&comment("");
&comment("IP");
&IP_new("eax",$L,$R,3);
}
else
{
&mov("eax",&wparam(0));
&xor( "ecx", "ecx" );
&push("ebx");
&push("ebp");
&mov($R,&DWP(0,"eax","",0));
&mov("ebx",&wparam(2)); # get encrypt flag
&rotl($R,3);
&mov($L,&DWP(4,"eax","",0));
&rotl($L,3);
}
# PIC-ification:-)
# call/pop yields the address of pic_point; the lea then adds the
# des_sptrans - pic_point distance to form the table address in $trans.
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop($trans);
&lea ($trans,&DWP(&label("des_sptrans")."-".&label("pic_point"),$trans));
&mov( "ecx", &wparam(1) );
# Dispatch on the flag loaded into ebx above.
&cmp("ebx","0");
&je(&label("decrypt"));
&call("_x86_DES_encrypt");
&jmp(&label("done"));
&set_label("decrypt");
&call("_x86_DES_decrypt");
&set_label("done");
if ($do_ip)
{
&comment("");
&comment("FP");
&mov("edx",&wparam(0));
&FP_new($L,$R,"eax",3);
&mov(&DWP(0,"edx","",0),"eax");
&mov(&DWP(4,"edx","",0),$R);
}
else
{
&comment("");
&comment("Fixup");
# Undo the rotl-by-3 applied on load and store the words back.
&rotr($L,3); # r
&mov("eax",&wparam(0));
&rotr($R,3); # l
&mov(&DWP(0,"eax","",0),$L);
&mov(&DWP(4,"eax","",0),$R);
}
&pop("ebp");
&pop("ebx");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
# Emit one DES round.
#
# Arguments:
# $r - round number; not referenced in the body.
# $L - register that receives the XOR of the eight table lookups.
# $R - register holding the round input.
# $S - index of the first of two consecutive 32-bit words read from
# [$tmp2 + $S*4] and [$tmp2 + ($S+1)*4].
# $trans - base register of the lookup table.
# $u, $t - scratch registers holding the two mixed words.
# $tmp1 - scratch index register.
# $tmp2 - on entry the base for the two word loads above; reused as a
# scratch index and reloaded from $wp1 before the round ends.
# $wp1 - operand from which $tmp2 is reloaded.
sub D_ENCRYPT
{
local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t,$wp1)=@_;
# Both words must be fetched through $tmp2 before it is zeroed below.
&mov( $u, &DWP(&n2a($S*4),$tmp2,"",0));
&xor( $tmp1, $tmp1);
&mov( $t, &DWP(&n2a(($S+1)*4),$tmp2,"",0));
&xor( $u, $R);
&xor( $tmp2, $tmp2);
&xor( $t, $R);
&and( $u, "0xfcfcfcfc" );
&and( $t, "0xcfcfcfcf" );
&movb( &LB($tmp1), &LB($u) );
&movb( &LB($tmp2), &HB($u) );
&rotr( $t, 4 );
# Eight lookups at displacements 0x000..0x700 from $trans, indexed by the
# individual bytes of $u and $t, each XORed into $L.
&xor( $L, &DWP(" ",$trans,$tmp1,0));
&movb( &LB($tmp1), &LB($t) );
&xor( $L, &DWP("0x200",$trans,$tmp2,0));
&movb( &LB($tmp2), &HB($t) );
&shr( $u, 16);
&xor( $L, &DWP("0x100",$trans,$tmp1,0));
&movb( &LB($tmp1), &HB($u) );
&shr( $t, 16);
&xor( $L, &DWP("0x300",$trans,$tmp2,0));
&movb( &LB($tmp2), &HB($t) );
&and( $u, "0xff" );
&and( $t, "0xff" );
&xor( $L, &DWP("0x600",$trans,$tmp1,0));
&xor( $L, &DWP("0x700",$trans,$tmp2,0));
# Reload $tmp2 from $wp1 so the next round can use it as a base again.
&mov( $tmp2, $wp1 );
&xor( $L, &DWP("0x400",$trans,$u,0));
&xor( $L, &DWP("0x500",$trans,$t,0));
}
# Format a number as a decimal integer string (fractions are truncated by
# the %d conversion). Used to build displacement operands.
sub n2a
{
    my ($number) = @_;
    return sprintf('%d', $number);
}
# Emit one swap step of the IP/FP permutation network.
#
# Arguments:
# $a     - first register; rotated left by $shift (side effect) and left
#          holding the masked difference ($a ^ $b) & $mask.
# $b     - second register; XORed with the masked difference.
# $tt    - receives the rotated $a XORed with the masked difference.
# $shift - left-rotate count applied to $a first; 0 emits no rotate.
# $mask  - immediate AND mask.
# $last  - accepted so existing six-argument call sites keep working, but
#          ignored.
#
# The original code chose between two orderings of the final XOR pair with
# "if (!$last eq $b)".  That parses as "(!$last) eq $b", which compares an
# empty string (or "1") with a register name and so never succeeded -- as
# Ben already noted on 13 Feb 99.  Only the ordering below was ever
# emitted; the dead arm has been removed and the generated instructions
# are unchanged.
#
# The emitter calls keep the "&" sigil on purpose: xor, and, push, ... are
# Perl built-ins, so "&xor(...)" is required to reach the user-defined sub.
sub R_PERM_OP
{
    my ($reg_a, $reg_b, $reg_tt, $shift, $mask, $last) = @_;

    &rotl($reg_a, $shift) if ($shift != 0);
    &mov($reg_tt, $reg_a);
    &xor($reg_a, $reg_b);
    &and($reg_a, $mask);
    &xor($reg_tt, $reg_a);
    &xor($reg_b, $reg_a);
    &comment("");
}
# Emit the initial permutation: five R_PERM_OP steps followed by rotate
# fix-ups that depend on $lr.
#
# Arguments:
# $l, $r, $tt - the three registers threaded through the R_PERM_OP steps.
# $lr         - rotation selector: $tt is rotated left by ($lr - 3) and $r
#               left by ($lr - 2); a negative amount becomes a right
#               rotate and zero emits nothing.
sub IP_new
{
    my ($left, $right, $scratch, $lr) = @_;

    # Register roles, rotate count and mask for each step, in emission order.
    foreach my $step (
        [ $left,    $right,   $scratch,  4, "0xf0f0f0f0", $left  ],
        [ $right,   $scratch, $left,    20, "0xfff0000f", $left  ],
        [ $left,    $scratch, $right,   14, "0x33333333", $right ],
        [ $scratch, $right,   $left,    22, "0x03fc03fc", $right ],
        [ $left,    $right,   $scratch,  9, "0xaaaaaaaa", $right ],
    )
    {
        &R_PERM_OP(@$step);
    }

    my $scratch_rot = $lr - 3;
    if ($scratch_rot < 0)
    {
        &rotr($scratch, -$scratch_rot);
    }
    elsif ($scratch_rot > 0)
    {
        &rotl($scratch, $scratch_rot);
    }

    my $right_rot = $lr - 2;
    if ($right_rot < 0)
    {
        &rotr($right, -$right_rot);
    }
    elsif ($right_rot > 0)
    {
        &rotl($right, $right_rot);
    }
}
# Emit the final permutation: rotate fix-ups that depend on $lr, five
# R_PERM_OP steps, and a closing 4-bit right rotate of $tt.
#
# Arguments:
# $l, $r, $tt - the three registers threaded through the R_PERM_OP steps.
# $lr         - rotation selector: $r is rotated right by ($lr - 2) and $l
#               right by ($lr - 3); a negative amount becomes a left
#               rotate and zero emits nothing.
sub FP_new
{
    my ($left, $right, $scratch, $lr) = @_;

    my $right_rot = $lr - 2;
    if ($right_rot < 0)
    {
        &rotl($right, -$right_rot);
    }
    elsif ($right_rot > 0)
    {
        &rotr($right, $right_rot);
    }

    my $left_rot = $lr - 3;
    if ($left_rot < 0)
    {
        &rotl($left, -$left_rot);
    }
    elsif ($left_rot > 0)
    {
        &rotr($left, $left_rot);
    }

    # Register roles, rotate count and mask for each step, in emission order.
    foreach my $step (
        [ $left,    $right,   $scratch,  0, "0xaaaaaaaa", $right ],
        [ $scratch, $right,   $left,    23, "0x03fc03fc", $right ],
        [ $left,    $right,   $scratch, 10, "0x33333333", $left  ],
        [ $right,   $scratch, $left,    18, "0xfff0000f", $left  ],
        [ $left,    $scratch, $right,   12, "0xf0f0f0f0", $right ],
    )
    {
        &R_PERM_OP(@$step);
    }

    &rotr($scratch, 4);
}
# Emit the lookup table used by D_ENCRYPT under two labels: DES_SPtrans
# (set_label is given 64 as a second argument -- presumably an alignment;
# confirm in x86asm.pl) and des_sptrans.
#
# The data is eight consecutive sub-tables ("nibble 0" .. "nibble 7") of
# 64 32-bit words each, i.e. 0x100 bytes per sub-table, which matches the
# 0x000..0x700 displacements D_ENCRYPT adds to the table base.  The values
# and their order are fixed constants and must not be changed.
sub DES_SPtrans
{
&set_label("DES_SPtrans",64);
&set_label("des_sptrans");
# nibble 0
&data_word(0x02080800, 0x00080000, 0x02000002, 0x02080802);
&data_word(0x02000000, 0x00080802, 0x00080002, 0x02000002);
&data_word(0x00080802, 0x02080800, 0x02080000, 0x00000802);
&data_word(0x02000802, 0x02000000, 0x00000000, 0x00080002);
&data_word(0x00080000, 0x00000002, 0x02000800, 0x00080800);
&data_word(0x02080802, 0x02080000, 0x00000802, 0x02000800);
&data_word(0x00000002, 0x00000800, 0x00080800, 0x02080002);
&data_word(0x00000800, 0x02000802, 0x02080002, 0x00000000);
&data_word(0x00000000, 0x02080802, 0x02000800, 0x00080002);
&data_word(0x02080800, 0x00080000, 0x00000802, 0x02000800);
&data_word(0x02080002, 0x00000800, 0x00080800, 0x02000002);
&data_word(0x00080802, 0x00000002, 0x02000002, 0x02080000);
&data_word(0x02080802, 0x00080800, 0x02080000, 0x02000802);
&data_word(0x02000000, 0x00000802, 0x00080002, 0x00000000);
&data_word(0x00080000, 0x02000000, 0x02000802, 0x02080800);
&data_word(0x00000002, 0x02080002, 0x00000800, 0x00080802);
# nibble 1
&data_word(0x40108010, 0x00000000, 0x00108000, 0x40100000);
&data_word(0x40000010, 0x00008010, 0x40008000, 0x00108000);
&data_word(0x00008000, 0x40100010, 0x00000010, 0x40008000);
&data_word(0x00100010, 0x40108000, 0x40100000, 0x00000010);
&data_word(0x00100000, 0x40008010, 0x40100010, 0x00008000);
&data_word(0x00108010, 0x40000000, 0x00000000, 0x00100010);
&data_word(0x40008010, 0x00108010, 0x40108000, 0x40000010);
&data_word(0x40000000, 0x00100000, 0x00008010, 0x40108010);
&data_word(0x00100010, 0x40108000, 0x40008000, 0x00108010);
&data_word(0x40108010, 0x00100010, 0x40000010, 0x00000000);
&data_word(0x40000000, 0x00008010, 0x00100000, 0x40100010);
&data_word(0x00008000, 0x40000000, 0x00108010, 0x40008010);
&data_word(0x40108000, 0x00008000, 0x00000000, 0x40000010);
&data_word(0x00000010, 0x40108010, 0x00108000, 0x40100000);
&data_word(0x40100010, 0x00100000, 0x00008010, 0x40008000);
&data_word(0x40008010, 0x00000010, 0x40100000, 0x00108000);
# nibble 2
&data_word(0x04000001, 0x04040100, 0x00000100, 0x04000101);
&data_word(0x00040001, 0x04000000, 0x04000101, 0x00040100);
&data_word(0x04000100, 0x00040000, 0x04040000, 0x00000001);
&data_word(0x04040101, 0x00000101, 0x00000001, 0x04040001);
&data_word(0x00000000, 0x00040001, 0x04040100, 0x00000100);
&data_word(0x00000101, 0x04040101, 0x00040000, 0x04000001);
&data_word(0x04040001, 0x04000100, 0x00040101, 0x04040000);
&data_word(0x00040100, 0x00000000, 0x04000000, 0x00040101);
&data_word(0x04040100, 0x00000100, 0x00000001, 0x00040000);
&data_word(0x00000101, 0x00040001, 0x04040000, 0x04000101);
&data_word(0x00000000, 0x04040100, 0x00040100, 0x04040001);
&data_word(0x00040001, 0x04000000, 0x04040101, 0x00000001);
&data_word(0x00040101, 0x04000001, 0x04000000, 0x04040101);
&data_word(0x00040000, 0x04000100, 0x04000101, 0x00040100);
&data_word(0x04000100, 0x00000000, 0x04040001, 0x00000101);
&data_word(0x04000001, 0x00040101, 0x00000100, 0x04040000);
# nibble 3
&data_word(0x00401008, 0x10001000, 0x00000008, 0x10401008);
&data_word(0x00000000, 0x10400000, 0x10001008, 0x00400008);
&data_word(0x10401000, 0x10000008, 0x10000000, 0x00001008);
&data_word(0x10000008, 0x00401008, 0x00400000, 0x10000000);
&data_word(0x10400008, 0x00401000, 0x00001000, 0x00000008);
&data_word(0x00401000, 0x10001008, 0x10400000, 0x00001000);
&data_word(0x00001008, 0x00000000, 0x00400008, 0x10401000);
&data_word(0x10001000, 0x10400008, 0x10401008, 0x00400000);
&data_word(0x10400008, 0x00001008, 0x00400000, 0x10000008);
&data_word(0x00401000, 0x10001000, 0x00000008, 0x10400000);
&data_word(0x10001008, 0x00000000, 0x00001000, 0x00400008);
&data_word(0x00000000, 0x10400008, 0x10401000, 0x00001000);
&data_word(0x10000000, 0x10401008, 0x00401008, 0x00400000);
&data_word(0x10401008, 0x00000008, 0x10001000, 0x00401008);
&data_word(0x00400008, 0x00401000, 0x10400000, 0x10001008);
&data_word(0x00001008, 0x10000000, 0x10000008, 0x10401000);
# nibble 4
&data_word(0x08000000, 0x00010000, 0x00000400, 0x08010420);
&data_word(0x08010020, 0x08000400, 0x00010420, 0x08010000);
&data_word(0x00010000, 0x00000020, 0x08000020, 0x00010400);
&data_word(0x08000420, 0x08010020, 0x08010400, 0x00000000);
&data_word(0x00010400, 0x08000000, 0x00010020, 0x00000420);
&data_word(0x08000400, 0x00010420, 0x00000000, 0x08000020);
&data_word(0x00000020, 0x08000420, 0x08010420, 0x00010020);
&data_word(0x08010000, 0x00000400, 0x00000420, 0x08010400);
&data_word(0x08010400, 0x08000420, 0x00010020, 0x08010000);
&data_word(0x00010000, 0x00000020, 0x08000020, 0x08000400);
&data_word(0x08000000, 0x00010400, 0x08010420, 0x00000000);
&data_word(0x00010420, 0x08000000, 0x00000400, 0x00010020);
&data_word(0x08000420, 0x00000400, 0x00000000, 0x08010420);
&data_word(0x08010020, 0x08010400, 0x00000420, 0x00010000);
&data_word(0x00010400, 0x08010020, 0x08000400, 0x00000420);
&data_word(0x00000020, 0x00010420, 0x08010000, 0x08000020);
# nibble 5
&data_word(0x80000040, 0x00200040, 0x00000000, 0x80202000);
&data_word(0x00200040, 0x00002000, 0x80002040, 0x00200000);
&data_word(0x00002040, 0x80202040, 0x00202000, 0x80000000);
&data_word(0x80002000, 0x80000040, 0x80200000, 0x00202040);
&data_word(0x00200000, 0x80002040, 0x80200040, 0x00000000);
&data_word(0x00002000, 0x00000040, 0x80202000, 0x80200040);
&data_word(0x80202040, 0x80200000, 0x80000000, 0x00002040);
&data_word(0x00000040, 0x00202000, 0x00202040, 0x80002000);
&data_word(0x00002040, 0x80000000, 0x80002000, 0x00202040);
&data_word(0x80202000, 0x00200040, 0x00000000, 0x80002000);
&data_word(0x80000000, 0x00002000, 0x80200040, 0x00200000);
&data_word(0x00200040, 0x80202040, 0x00202000, 0x00000040);
&data_word(0x80202040, 0x00202000, 0x00200000, 0x80002040);
&data_word(0x80000040, 0x80200000, 0x00202040, 0x00000000);
&data_word(0x00002000, 0x80000040, 0x80002040, 0x80202000);
&data_word(0x80200000, 0x00002040, 0x00000040, 0x80200040);
# nibble 6
&data_word(0x00004000, 0x00000200, 0x01000200, 0x01000004);
&data_word(0x01004204, 0x00004004, 0x00004200, 0x00000000);
&data_word(0x01000000, 0x01000204, 0x00000204, 0x01004000);
&data_word(0x00000004, 0x01004200, 0x01004000, 0x00000204);
&data_word(0x01000204, 0x00004000, 0x00004004, 0x01004204);
&data_word(0x00000000, 0x01000200, 0x01000004, 0x00004200);
&data_word(0x01004004, 0x00004204, 0x01004200, 0x00000004);
&data_word(0x00004204, 0x01004004, 0x00000200, 0x01000000);
&data_word(0x00004204, 0x01004000, 0x01004004, 0x00000204);
&data_word(0x00004000, 0x00000200, 0x01000000, 0x01004004);
&data_word(0x01000204, 0x00004204, 0x00004200, 0x00000000);
&data_word(0x00000200, 0x01000004, 0x00000004, 0x01000200);
&data_word(0x00000000, 0x01000204, 0x01000200, 0x00004200);
&data_word(0x00000204, 0x00004000, 0x01004204, 0x01000000);
&data_word(0x01004200, 0x00000004, 0x00004004, 0x01004204);
&data_word(0x01000004, 0x01004200, 0x01004000, 0x00004004);
# nibble 7
&data_word(0x20800080, 0x20820000, 0x00020080, 0x00000000);
&data_word(0x20020000, 0x00800080, 0x20800000, 0x20820080);
&data_word(0x00000080, 0x20000000, 0x00820000, 0x00020080);
&data_word(0x00820080, 0x20020080, 0x20000080, 0x20800000);
&data_word(0x00020000, 0x00820080, 0x00800080, 0x20020000);
&data_word(0x20820080, 0x20000080, 0x00000000, 0x00820000);
&data_word(0x20000000, 0x00800000, 0x20020080, 0x20800080);
&data_word(0x00800000, 0x00020000, 0x20820000, 0x00000080);
&data_word(0x00800000, 0x00020000, 0x20000080, 0x20820080);
&data_word(0x00020080, 0x20000000, 0x00000000, 0x00820000);
&data_word(0x20800080, 0x20020080, 0x20020000, 0x00800080);
&data_word(0x20820000, 0x00000080, 0x00800080, 0x20020000);
&data_word(0x20820080, 0x00800000, 0x20800000, 0x20000080);
&data_word(0x00820000, 0x00020080, 0x20020080, 0x20800000);
&data_word(0x00000080, 0x20820000, 0x00820080, 0x00000000);
&data_word(0x20000000, 0x20800080, 0x00020000, 0x00820080);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
#!/usr/local/bin/perl
# Registers that hold the two 32-bit halves of the data block; read by
# DES_encrypt3 below.
$L="edi";
$R="esi";
# Emit a triple-pass routine named $name.
#
# Arguments:
# $name - label of the generated function.
# $enc  - true selects the encrypting variant, false the decrypting one.
#
# The generated function loads the two data words through the pointer in
# wparam(0), applies IP_new, writes them back, and then calls DES_encrypt2
# three times on that same buffer.  The calls use the values taken from
# wparam(1), wparam(2), wparam(3) in that order when $enc is true and in
# the reverse order otherwise, with direction flags enc, !enc, enc.
# Finally it reloads the words, applies FP_new and stores the result.
#
# Emission order is unchanged from the long-hand original; only the
# duplicated branches have been folded.
sub DES_encrypt3
{
    my ($name, $enc) = @_;

    # Which register receives wparam(1) and which wparam(3); the first and
    # third passes always use eax and esi respectively.
    my ($reg_for_p1, $reg_for_p3) = $enc ? ("eax", "esi") : ("esi", "eax");
    # Direction flag for the outer (1st/3rd) and the middle pass.
    my ($outer_flag, $middle_flag) = $enc ? ("1", "0") : ("0", "1");

    &function_begin_B($name,"");
    &push("ebx");
    &mov("ebx",&wparam(0));
    &push("ebp");
    &push("esi");
    &push("edi");

    &comment("");
    &comment("Load the data words");
    &mov($L,&DWP(0,"ebx","",0));
    &mov($R,&DWP(4,"ebx","",0));
    &stack_push(3);

    &comment("");
    &comment("IP");
    &IP_new($L,$R,"edx",0);

    # put them back
    &mov(&DWP(4,"ebx","",0),$R);
    &mov($reg_for_p1,&wparam(1));
    &mov(&DWP(0,"ebx","",0),"edx");
    &mov("edi",&wparam(2));
    &mov($reg_for_p3,&wparam(3));

    # Three passes: argument slots are swtmp(0) = data pointer,
    # swtmp(1) = the per-pass register, swtmp(2) = direction flag.
    foreach my $pass (
        [ $outer_flag,  "eax" ],
        [ $middle_flag, "edi" ],
        [ $outer_flag,  "esi" ],
    )
    {
        my ($flag, $pass_reg) = @$pass;
        &mov(&swtmp(2), (DWC($flag)));
        &mov(&swtmp(1), $pass_reg);
        &mov(&swtmp(0), "ebx");
        &call("DES_encrypt2");
    }

    &stack_pop(3);
    &mov($L,&DWP(0,"ebx","",0));
    &mov($R,&DWP(4,"ebx","",0));

    &comment("");
    &comment("FP");
    &FP_new($L,$R,"eax",0);
    &mov(&DWP(0,"ebx","",0),"eax");
    &mov(&DWP(4,"ebx","",0),$R);

    &pop("edi");
    &pop("esi");
    &pop("ebp");
    &pop("ebx");
    &ret();
    &function_end_B($name);
}

View File

@@ -0,0 +1,617 @@
#!/usr/bin/env perl
# ====================================================================
# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. March 2013. All rights reserved.
# ====================================================================
######################################################################
# DES for SPARC T4.
#
# As with other hardware-assisted ciphers CBC encrypt results [for
# aligned data] are virtually identical to critical path lengths:
#
# DES Triple-DES
# CBC encrypt 4.14/4.15(*) 11.7/11.7
# CBC decrypt 1.77/4.11(**) 6.42/7.47
#
# (*) numbers after slash are for
# misaligned data;
# (**) this is result for largest
# block size, unlike all other
# cases smaller blocks results
# are better[?];
# Derive the directory this script lives in from $0 and add both it and
# its ../../perlasm sibling to @INC so that sparcv9_modes.pl can be found.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
&asm_init(@ARGV);
# Everything appended to $code is assembly text, so nothing inside the
# here-documents may be altered or annotated with Perl comments.
# The .register declarations are emitted only when $::abibits is 64.
$code.=<<___ if ($::abibits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.text
___
# des_t4_key_expand: $inp is %o0 and $out is %o1.
# The emitted routine loads one doubleword from $inp (using alignaddr and
# faligndata when the address is not 8-byte aligned), runs a chain of
# des_kexpand instructions, and stores sixteen doublewords to
# $out + 0x00 .. $out + 0x78.  The last store sits in the retl delay slot.
{ my ($inp,$out)=("%o0","%o1");
$code.=<<___;
.align 32
.globl des_t4_key_expand
.type des_t4_key_expand,#function
des_t4_key_expand:
andcc $inp, 0x7, %g0
alignaddr $inp, %g0, $inp
bz,pt %icc, 1f
ldd [$inp + 0x00], %f0
ldd [$inp + 0x08], %f2
faligndata %f0, %f2, %f0
1: des_kexpand %f0, 0, %f0
des_kexpand %f0, 1, %f2
std %f0, [$out + 0x00]
des_kexpand %f2, 3, %f6
std %f2, [$out + 0x08]
des_kexpand %f2, 2, %f4
des_kexpand %f6, 3, %f10
std %f6, [$out + 0x18]
des_kexpand %f6, 2, %f8
std %f4, [$out + 0x10]
des_kexpand %f10, 3, %f14
std %f10, [$out + 0x28]
des_kexpand %f10, 2, %f12
std %f8, [$out + 0x20]
des_kexpand %f14, 1, %f16
std %f14, [$out + 0x38]
des_kexpand %f16, 3, %f20
std %f12, [$out + 0x30]
des_kexpand %f16, 2, %f18
std %f16, [$out + 0x40]
des_kexpand %f20, 3, %f24
std %f20, [$out + 0x50]
des_kexpand %f20, 2, %f22
std %f18, [$out + 0x48]
des_kexpand %f24, 3, %f28
std %f24, [$out + 0x60]
des_kexpand %f24, 2, %f26
std %f22, [$out + 0x58]
des_kexpand %f28, 1, %f30
std %f28, [$out + 0x70]
std %f26, [$out + 0x68]
retl
std %f30, [$out + 0x78]
.size des_t4_key_expand,.-des_t4_key_expand
___
}
# CBC routines: des_t4_cbc_encrypt, des_t4_cbc_decrypt,
# des_t4_ede3_cbc_encrypt and des_t4_ede3_cbc_decrypt.
#
# Register assignment shared by all four:
# $inp, $out, $len, $key, $ivec = %o0 .. %o4
# $ileft, $iright, $omask = %g1 .. %g3
#
# Each routine returns immediately through .Lcbc_abort when $len is zero,
# otherwise shifts $len right by 3 and processes that many 8-byte blocks.
# $ileft is the input misalignment in bits (two loads are merged when it is
# non-zero); $omask is non-zero when the output is misaligned, in which
# case each block is written with two partial stores.  The chaining value
# is loaded from and written back to [$ivec].
#
# The single-DES encrypt routine reads the schedule at $key + 0x00 .. 0x78
# in ascending order, the decrypt routine in descending order.
{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
my ($ileft,$iright,$omask) = map("%g$_",(1..3));
$code.=<<___;
.globl des_t4_cbc_encrypt
.align 32
des_t4_cbc_encrypt:
cmp $len, 0
be,pn $::size_t_cc, .Lcbc_abort
nop
ld [$ivec + 0], %f0 ! load ivec
ld [$ivec + 4], %f1
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x00], %f4 ! load key schedule
ldd [$key + 0x08], %f6
ldd [$key + 0x10], %f8
ldd [$key + 0x18], %f10
ldd [$key + 0x20], %f12
ldd [$key + 0x28], %f14
ldd [$key + 0x30], %f16
ldd [$key + 0x38], %f18
ldd [$key + 0x40], %f20
ldd [$key + 0x48], %f22
ldd [$key + 0x50], %f24
ldd [$key + 0x58], %f26
ldd [$key + 0x60], %f28
ldd [$key + 0x68], %f30
ldd [$key + 0x70], %f32
ldd [$key + 0x78], %f34
.Ldes_cbc_enc_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f2
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
fxor %f2, %f0, %f0 ! ^= ivec
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
des_round %f20, %f22, %f0, %f0
des_round %f24, %f26, %f0, %f0
des_round %f28, %f30, %f0, %f0
des_round %f32, %f34, %f0, %f0
des_iip %f0, %f0
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_cbc_enc_loop
add $out, 8, $out
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.Lcbc_abort:
retl
nop
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~4x deterioration
! in inp==out case
faligndata %f0, %f0, %f2 ! handle unaligned output
stda %f2, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f2, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_cbc_enc_loop+4
orn %g0, $omask, $omask
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.type des_t4_cbc_encrypt,#function
.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
.globl des_t4_cbc_decrypt
.align 32
des_t4_cbc_decrypt:
cmp $len, 0
be,pn $::size_t_cc, .Lcbc_abort
nop
ld [$ivec + 0], %f2 ! load ivec
ld [$ivec + 4], %f3
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x78], %f4 ! load key schedule
ldd [$key + 0x70], %f6
ldd [$key + 0x68], %f8
ldd [$key + 0x60], %f10
ldd [$key + 0x58], %f12
ldd [$key + 0x50], %f14
ldd [$key + 0x48], %f16
ldd [$key + 0x40], %f18
ldd [$key + 0x38], %f20
ldd [$key + 0x30], %f22
ldd [$key + 0x28], %f24
ldd [$key + 0x20], %f26
ldd [$key + 0x18], %f28
ldd [$key + 0x10], %f30
ldd [$key + 0x08], %f32
ldd [$key + 0x00], %f34
.Ldes_cbc_dec_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f0
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
des_round %f20, %f22, %f0, %f0
des_round %f24, %f26, %f0, %f0
des_round %f28, %f30, %f0, %f0
des_round %f32, %f34, %f0, %f0
des_iip %f0, %f0
fxor %f2, %f0, %f0 ! ^= ivec
movxtod %g4, %f2
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_cbc_dec_loop
add $out, 8, $out
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~4x deterioration
! in inp==out case
faligndata %f0, %f0, %f0 ! handle unaligned output
stda %f0, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f0, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_cbc_dec_loop+4
orn %g0, $omask, $omask
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.type des_t4_cbc_decrypt,#function
.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
___
# One might wonder why there are back-to-back des_iip/des_ip pairs
# between the EDE passes. Aren't they inverses of each other? They
# almost are: the outcome of the pair is that the two 32-bit words in
# the target register are swapped. Treat a des_iip/des_ip pair as a way
# to perform that required swap; it is actually the fastest way here.
#
# The EDE3 routines keep the first pass's schedule resident in
# %f4 .. %f34 and stream the other two through %f36 .. %f62 on every
# block, interleaving those loads with the des_round instructions.
$code.=<<___;
.globl des_t4_ede3_cbc_encrypt
.align 32
des_t4_ede3_cbc_encrypt:
cmp $len, 0
be,pn $::size_t_cc, .Lcbc_abort
nop
ld [$ivec + 0], %f0 ! load ivec
ld [$ivec + 4], %f1
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x00], %f4 ! load key schedule
ldd [$key + 0x08], %f6
ldd [$key + 0x10], %f8
ldd [$key + 0x18], %f10
ldd [$key + 0x20], %f12
ldd [$key + 0x28], %f14
ldd [$key + 0x30], %f16
ldd [$key + 0x38], %f18
ldd [$key + 0x40], %f20
ldd [$key + 0x48], %f22
ldd [$key + 0x50], %f24
ldd [$key + 0x58], %f26
ldd [$key + 0x60], %f28
ldd [$key + 0x68], %f30
ldd [$key + 0x70], %f32
ldd [$key + 0x78], %f34
.Ldes_ede3_cbc_enc_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f2
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
fxor %f2, %f0, %f0 ! ^= ivec
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
ldd [$key + 0x100-0x08], %f36
ldd [$key + 0x100-0x10], %f38
des_round %f20, %f22, %f0, %f0
ldd [$key + 0x100-0x18], %f40
ldd [$key + 0x100-0x20], %f42
des_round %f24, %f26, %f0, %f0
ldd [$key + 0x100-0x28], %f44
ldd [$key + 0x100-0x30], %f46
des_round %f28, %f30, %f0, %f0
ldd [$key + 0x100-0x38], %f48
ldd [$key + 0x100-0x40], %f50
des_round %f32, %f34, %f0, %f0
ldd [$key + 0x100-0x48], %f52
ldd [$key + 0x100-0x50], %f54
des_iip %f0, %f0
ldd [$key + 0x100-0x58], %f56
ldd [$key + 0x100-0x60], %f58
des_ip %f0, %f0
ldd [$key + 0x100-0x68], %f60
ldd [$key + 0x100-0x70], %f62
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x100-0x78], %f36
ldd [$key + 0x100-0x80], %f38
des_round %f40, %f42, %f0, %f0
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
ldd [$key + 0x100+0x00], %f40
ldd [$key + 0x100+0x08], %f42
des_round %f52, %f54, %f0, %f0
ldd [$key + 0x100+0x10], %f44
ldd [$key + 0x100+0x18], %f46
des_round %f56, %f58, %f0, %f0
ldd [$key + 0x100+0x20], %f48
ldd [$key + 0x100+0x28], %f50
des_round %f60, %f62, %f0, %f0
ldd [$key + 0x100+0x30], %f52
ldd [$key + 0x100+0x38], %f54
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x100+0x40], %f56
ldd [$key + 0x100+0x48], %f58
des_iip %f0, %f0
ldd [$key + 0x100+0x50], %f60
ldd [$key + 0x100+0x58], %f62
des_ip %f0, %f0
ldd [$key + 0x100+0x60], %f36
ldd [$key + 0x100+0x68], %f38
des_round %f40, %f42, %f0, %f0
ldd [$key + 0x100+0x70], %f40
ldd [$key + 0x100+0x78], %f42
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
des_round %f52, %f54, %f0, %f0
des_round %f56, %f58, %f0, %f0
des_round %f60, %f62, %f0, %f0
des_round %f36, %f38, %f0, %f0
des_round %f40, %f42, %f0, %f0
des_iip %f0, %f0
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_ede3_cbc_enc_loop
add $out, 8, $out
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~2x deterioration
! in inp==out case
faligndata %f0, %f0, %f2 ! handle unaligned output
stda %f2, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f2, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
orn %g0, $omask, $omask
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.type des_t4_ede3_cbc_encrypt,#function
.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
.globl des_t4_ede3_cbc_decrypt
.align 32
des_t4_ede3_cbc_decrypt:
cmp $len, 0
be,pn $::size_t_cc, .Lcbc_abort
nop
ld [$ivec + 0], %f2 ! load ivec
ld [$ivec + 4], %f3
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x100+0x78], %f4 ! load key schedule
ldd [$key + 0x100+0x70], %f6
ldd [$key + 0x100+0x68], %f8
ldd [$key + 0x100+0x60], %f10
ldd [$key + 0x100+0x58], %f12
ldd [$key + 0x100+0x50], %f14
ldd [$key + 0x100+0x48], %f16
ldd [$key + 0x100+0x40], %f18
ldd [$key + 0x100+0x38], %f20
ldd [$key + 0x100+0x30], %f22
ldd [$key + 0x100+0x28], %f24
ldd [$key + 0x100+0x20], %f26
ldd [$key + 0x100+0x18], %f28
ldd [$key + 0x100+0x10], %f30
ldd [$key + 0x100+0x08], %f32
ldd [$key + 0x100+0x00], %f34
.Ldes_ede3_cbc_dec_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f0
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
ldd [$key + 0x80+0x00], %f36
ldd [$key + 0x80+0x08], %f38
des_round %f20, %f22, %f0, %f0
ldd [$key + 0x80+0x10], %f40
ldd [$key + 0x80+0x18], %f42
des_round %f24, %f26, %f0, %f0
ldd [$key + 0x80+0x20], %f44
ldd [$key + 0x80+0x28], %f46
des_round %f28, %f30, %f0, %f0
ldd [$key + 0x80+0x30], %f48
ldd [$key + 0x80+0x38], %f50
des_round %f32, %f34, %f0, %f0
ldd [$key + 0x80+0x40], %f52
ldd [$key + 0x80+0x48], %f54
des_iip %f0, %f0
ldd [$key + 0x80+0x50], %f56
ldd [$key + 0x80+0x58], %f58
des_ip %f0, %f0
ldd [$key + 0x80+0x60], %f60
ldd [$key + 0x80+0x68], %f62
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x80+0x70], %f36
ldd [$key + 0x80+0x78], %f38
des_round %f40, %f42, %f0, %f0
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
ldd [$key + 0x80-0x08], %f40
ldd [$key + 0x80-0x10], %f42
des_round %f52, %f54, %f0, %f0
ldd [$key + 0x80-0x18], %f44
ldd [$key + 0x80-0x20], %f46
des_round %f56, %f58, %f0, %f0
ldd [$key + 0x80-0x28], %f48
ldd [$key + 0x80-0x30], %f50
des_round %f60, %f62, %f0, %f0
ldd [$key + 0x80-0x38], %f52
ldd [$key + 0x80-0x40], %f54
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x80-0x48], %f56
ldd [$key + 0x80-0x50], %f58
des_iip %f0, %f0
ldd [$key + 0x80-0x58], %f60
ldd [$key + 0x80-0x60], %f62
des_ip %f0, %f0
ldd [$key + 0x80-0x68], %f36
ldd [$key + 0x80-0x70], %f38
des_round %f40, %f42, %f0, %f0
ldd [$key + 0x80-0x78], %f40
ldd [$key + 0x80-0x80], %f42
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
des_round %f52, %f54, %f0, %f0
des_round %f56, %f58, %f0, %f0
des_round %f60, %f62, %f0, %f0
des_round %f36, %f38, %f0, %f0
des_round %f40, %f42, %f0, %f0
des_iip %f0, %f0
fxor %f2, %f0, %f0 ! ^= ivec
movxtod %g4, %f2
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_ede3_cbc_dec_loop
add $out, 8, $out
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~3x deterioration
! in inp==out case
faligndata %f0, %f0, %f0 ! handle unaligned output
stda %f0, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f0, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
orn %g0, $omask, $omask
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.type des_t4_ede3_cbc_decrypt,#function
.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
___
}
# Append the identification string, hand the accumulated $code to
# emit_assembler, and close STDOUT.
$code.=<<___;
.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
# Buffered write errors (full disk, closed pipe) only surface when the
# handle is closed, so an unchecked close could leave a silently truncated
# assembly file behind.  Fail loudly instead.
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,131 @@
First up, let me say I don't like writing in assembler. It is not portable,
dependent on the particular CPU architecture release and is generally a pig
to debug and get right. Having said that, the x86 architecture is probably
the most important for speed due to the number of boxes and since
it appears to be the worst architecture to get
good C compilers for. So due to this, I have lowered myself to do
assembler for the inner DES routines in libdes :-).
The file to implement in assembler is des_enc.c. Replace the following
4 functions
des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt);
des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt);
des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3);
They encrypt/decrypt the 64 bits held in 'data' using
the 'ks' key schedules. The only difference between the 4 functions is that
des_encrypt2() does not perform IP() or FP() on the data (this is an
optimization for when doing triple DES), and des_encrypt3() and des_decrypt3()
perform triple DES. The triple DES routines are in here because it does
make a big difference to have them located near the des_encrypt2 function
at link time.
Now as we all know, there are lots of different operating systems running on
x86 boxes, and unfortunately they normally try to make sure their assembler
formatting is not the same as other people's.
The 4 main formats I know of are
Microsoft Windows 95/Windows NT
Elf Includes Linux and FreeBSD(?).
a.out The older Linux.
Solaris Same as Elf but different comments :-(.
Now I was not overly keen to write 4 different copies of the same code,
so I wrote a few perl routines to output the correct assembler, given
a target assembler type. This code is ugly and is just a hack.
The libraries are x86unix.pl and x86ms.pl.
des586.pl, des686.pl and des-som[23].pl are the programs to actually
generate the assembler.
So to generate elf assembler
perl des-som3.pl elf >dx86-elf.s
For Windows 95/NT
perl des-som2.pl win32 >win32.asm
[ update 4 Jan 1996 ]
I have added another way to do things.
perl des-som3.pl cpp >dx86-cpp.s
generates a file that will be included by dx86unix.cpp when it is compiled.
To build for elf, a.out, solaris, bsdi etc,
cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o
cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o
cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o
cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o
This was done to cut down the number of files in the distribution.
Now the ugly part. I acquired my copy of Intel's
"Optimizations For Intel's 32-Bit Processors" and found a few interesting
things. First, the aim of the exercise is to 'extract' one byte at a time
from a word and do an array lookup. This involves getting the byte from
the 4 locations in the word and moving it to a new word and doing the lookup.
The most obvious way to do this is
xor eax, eax # clear word
movb al, cl # get low byte
xor edi, DWORD PTR 0x100+des_SP[eax] # xor in word
movb al, ch # get next byte
xor edi, DWORD PTR 0x300+des_SP[eax] # xor in word
shr ecx, 16
which seems ok. For the pentium, this system appears to be the best.
One has to do instruction interleaving to keep both functional units
operating, but it is basically very efficient.
Now the crunch. When a full register is used after a partial write, eg.
mov al, cl
xor edi, DWORD PTR 0x100+des_SP[eax]
386 - 1 cycle stall
486 - 1 cycle stall
586 - 0 cycle stall
686 - at least 7 cycle stall (page 22 of the above mentioned document).
So the technique that produces the best results on a pentium, according to
the documentation, will produce hideous results on a pentium pro.
To get around this, des686.pl will generate code that is not as fast on
a pentium, but should be very good on a pentium pro.
mov eax, ecx # copy word
shr ecx, 8 # line up next byte
and eax, 0fch # mask byte
xor edi, DWORD PTR 0x100+des_SP[eax] # xor in array lookup
mov eax, ecx # get word
shr ecx, 8 # line up next byte
and eax, 0fch # mask byte
xor edi, DWORD PTR 0x300+des_SP[eax] # xor in array lookup
Due to the execution units in the pentium, this actually works quite well.
For a pentium pro it should be very good. This is the type of output
Visual C++ generates.
There is a third option. Instead of using
mov al, ch
which is bad on the pentium pro, one may be able to use
movzx eax, ch
which may not incur the partial write penalty. On the pentium,
this instruction takes 4 cycles so is not worth using but on the
pentium pro it appears it may be worth while. I need access to one to
experiment :-).
eric (20 Oct 1996)
22 Nov 1996 - I have asked people to run the 2 different versions on pentium
pros and it appears that the intel documentation is wrong. The
mov al,bh is still faster on a pentium pro, so just use des586.pl
instead of des686.pl
3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these
functions into des_enc.c, as it does make a massive performance
difference on some boxes to have the functions' code located close to
the des_encrypt2() function.
9 Jan 1997 - des-som2.pl is now the correct perl script to use for
pentiums. It contains an inner loop from
Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at
273,000 per second. He had a previous version at 250,000 and the best
I was able to get was 203,000. The content has not changed, this is all
due to instruction sequencing (and actual instructions choice) which is able
to keep both functional units of the pentium going.
We may have lost the ugly register usage restrictions when x86 went 32 bit
but for the pentium it has been replaced by evil instruction ordering tricks.
13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf.
raw DES at 281,000 per second on a pentium 100.