Initial Commit

This commit is contained in:
root
2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions

View File

@@ -0,0 +1,389 @@
#
# OpenSSL/crypto/bn/Makefile
#
DIR= bn
TOP= ../..
CC= cc
CPP= $(CC) -E
INCLUDES= -I.. -I$(TOP) -I../../include
CFLAG=-g
MAKEFILE= Makefile
AR= ar r
BN_ASM= bn_asm.o
CFLAGS= $(INCLUDES) $(CFLAG)
ASFLAGS= $(INCLUDES) $(ASFLAG)
AFLAGS= $(ASFLAGS)
GENERAL=Makefile
TEST=bntest.c exptest.c
APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC= bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
bn_depr.c bn_const.c bn_x931p.c
LIBOBJ= bn_add.o bn_div.o bn_exp.o bn_lib.o bn_ctx.o bn_mul.o bn_mod.o \
bn_print.o bn_rand.o bn_shift.o bn_word.o bn_blind.o \
bn_kron.o bn_sqrt.o bn_gcd.o bn_prime.o bn_err.o bn_sqr.o $(BN_ASM) \
bn_recp.o bn_mont.o bn_mpi.o bn_exp2.o bn_gf2m.o bn_nist.o \
bn_depr.o bn_const.o bn_x931p.o
SRC= $(LIBSRC)
EXHEADER= bn.h
HEADER= bn_lcl.h bn_prime.h $(EXHEADER)
ALL= $(GENERAL) $(SRC) $(HEADER)
top:
(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
all: lib
bn_prime.h: bn_prime.pl
$(PERL) bn_prime.pl >bn_prime.h
divtest: divtest.c ../../libcrypto.a
cc -I../../include divtest.c -o divtest ../../libcrypto.a
bnbug: bnbug.c ../../libcrypto.a top
cc -g -I../../include bnbug.c -o bnbug ../../libcrypto.a
lib: $(LIBOBJ)
$(AR) $(LIB) $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
bn-586.s: asm/bn-586.pl ../perlasm/x86asm.pl
$(PERL) asm/bn-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
co-586.s: asm/co-586.pl ../perlasm/x86asm.pl
$(PERL) asm/co-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
x86-mont.s: asm/x86-mont.pl ../perlasm/x86asm.pl
$(PERL) asm/x86-mont.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
x86-gf2m.s: asm/x86-gf2m.pl ../perlasm/x86asm.pl
$(PERL) asm/x86-gf2m.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
sparcv8.o: asm/sparcv8.S
$(CC) $(CFLAGS) -c asm/sparcv8.S
bn-sparcv9.o: asm/sparcv8plus.S
$(CC) $(CFLAGS) -c -o $@ asm/sparcv8plus.S
sparcv9a-mont.s: asm/sparcv9a-mont.pl
$(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
sparcv9-mont.s: asm/sparcv9-mont.pl
$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
vis3-mont.s: asm/vis3-mont.pl
$(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
sparct4-mont.S: asm/sparct4-mont.pl
$(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
sparcv9-gf2m.S: asm/sparcv9-gf2m.pl
$(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
bn-mips3.o: asm/mips3.s
@if [ "$(CC)" = "gcc" ]; then \
ABI=`expr "$(CFLAGS)" : ".*-mabi=\([n3264]*\)"` && \
as -$$ABI -O -o $@ asm/mips3.s; \
else $(CC) -c $(CFLAGS) -o $@ asm/mips3.s; fi
bn-mips.s: asm/mips.pl
$(PERL) asm/mips.pl $(PERLASM_SCHEME) $@
mips-mont.s: asm/mips-mont.pl
$(PERL) asm/mips-mont.pl $(PERLASM_SCHEME) $@
bn-s390x.o: asm/s390x.S
$(CC) $(CFLAGS) -c -o $@ asm/s390x.S
s390x-gf2m.s: asm/s390x-gf2m.pl
$(PERL) asm/s390x-gf2m.pl $(PERLASM_SCHEME) $@
x86_64-gcc.o: asm/x86_64-gcc.c
$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
x86_64-mont.s: asm/x86_64-mont.pl
$(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
x86_64-mont5.s: asm/x86_64-mont5.pl
$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
x86_64-gf2m.s: asm/x86_64-gf2m.pl
$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
rsaz-x86_64.s: asm/rsaz-x86_64.pl
$(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@
rsaz-avx2.s: asm/rsaz-avx2.pl
$(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@
bn-ia64.s: asm/ia64.S
$(CC) $(CFLAGS) -E asm/ia64.S > $@
ia64-mont.s: asm/ia64-mont.pl
$(PERL) asm/ia64-mont.pl $@ $(CFLAGS)
# GNU assembler fails to compile PA-RISC2 modules, insist on calling
# vendor assembler...
pa-risc2W.o: asm/pa-risc2W.s
/usr/ccs/bin/as -o pa-risc2W.o asm/pa-risc2W.s
pa-risc2.o: asm/pa-risc2.s
/usr/ccs/bin/as -o pa-risc2.o asm/pa-risc2.s
parisc-mont.s: asm/parisc-mont.pl
$(PERL) asm/parisc-mont.pl $(PERLASM_SCHEME) $@
# ppc - AIX, Linux, MacOS X...
bn-ppc.s: asm/ppc.pl; $(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@
ppc-mont.s: asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@
ppc64-mont.s: asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@
alpha-mont.s: asm/alpha-mont.pl
(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
$(PERL) asm/alpha-mont.pl > $$preproc && \
$(CC) -E -P $$preproc > $@ && rm $$preproc)
# GNU make "catch all"
%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@
%-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@
armv4-mont.o: armv4-mont.S
armv4-gf2m.o: armv4-gf2m.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
links:
@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
install:
@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
@headerlist="$(EXHEADER)"; for i in $$headerlist ; \
do \
(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
done;
exptest:
rm -f exptest
gcc -I../../include -g2 -ggdb -o exptest exptest.c ../../libcrypto.a
div:
rm -f a.out
gcc -I.. -g div.c ../../libcrypto.a
tags:
ctags $(SRC)
tests:
lint:
lint -DLINT $(INCLUDES) $(SRC)>fluff
update: bn_prime.h depend
depend:
@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
dclean:
$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
mv -f Makefile.new $(MAKEFILE)
clean:
rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
# DO NOT DELETE THIS LINE -- make depend depends on it.
bn_add.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_add.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_add.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_add.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_add.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_add.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_add.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_add.c bn_lcl.h
bn_asm.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_asm.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_asm.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_asm.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_asm.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_asm.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_asm.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_asm.c bn_lcl.h
bn_blind.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_blind.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_blind.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_blind.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_blind.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_blind.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_blind.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_blind.c bn_lcl.h
bn_const.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
bn_const.o: ../../include/openssl/opensslconf.h
bn_const.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_const.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_const.o: ../../include/openssl/symhacks.h bn.h bn_const.c
bn_ctx.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_ctx.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_ctx.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_ctx.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_ctx.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_ctx.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_ctx.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_ctx.c bn_lcl.h
bn_depr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_depr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_depr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_depr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_depr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_depr.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
bn_depr.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
bn_depr.o: ../cryptlib.h bn_depr.c bn_lcl.h
bn_div.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_div.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_div.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_div.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_div.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_div.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_div.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_div.c bn_lcl.h
bn_err.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_err.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
bn_err.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
bn_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
bn_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
bn_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
bn_err.o: bn_err.c
bn_exp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
bn_exp.o: rsaz_exp.h
bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_exp2.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_exp2.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_exp2.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_exp2.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp2.c bn_lcl.h
bn_gcd.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_gcd.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_gcd.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_gcd.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_gcd.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_gcd.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_gcd.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gcd.c bn_lcl.h
bn_gf2m.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_gf2m.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_gf2m.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_gf2m.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_gf2m.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_gf2m.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_gf2m.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gf2m.c bn_lcl.h
bn_kron.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_kron.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_kron.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_kron.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_kron.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_kron.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_kron.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_kron.c bn_lcl.h
bn_lib.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_lib.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_lib.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_lib.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_lib.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_lib.c
bn_mod.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_mod.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_mod.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_mod.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_mod.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_mod.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_mod.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mod.c
bn_mont.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_mont.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_mont.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_mont.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_mont.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_mont.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_mont.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mont.c
bn_mpi.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_mpi.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_mpi.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_mpi.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_mpi.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_mpi.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_mpi.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mpi.c
bn_mul.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_mul.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_mul.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_mul.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_mul.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_mul.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_mul.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mul.c
bn_nist.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_nist.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_nist.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_nist.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_nist.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_nist.c
bn_prime.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_prime.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_prime.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_prime.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_prime.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_prime.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
bn_prime.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
bn_prime.o: ../cryptlib.h bn_lcl.h bn_prime.c bn_prime.h
bn_print.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_print.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_print.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_print.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_print.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_print.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_print.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_print.c
bn_rand.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_rand.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_rand.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_rand.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_rand.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_rand.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
bn_rand.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
bn_rand.o: ../cryptlib.h bn_lcl.h bn_rand.c
bn_recp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_recp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_recp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_recp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_recp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_recp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_recp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_recp.c
bn_shift.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_shift.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_shift.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_shift.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_shift.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_shift.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_shift.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_shift.c
bn_sqr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_sqr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_sqr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_sqr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_sqr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_sqr.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_sqr.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqr.c
bn_sqrt.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_sqrt.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_sqrt.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_sqrt.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_sqrt.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_sqrt.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_sqrt.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqrt.c
bn_word.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
bn_word.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
bn_word.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
bn_word.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
bn_word.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_word.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_word.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_word.c
bn_x931p.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h
bn_x931p.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
bn_x931p.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
bn_x931p.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
bn_x931p.o: ../../include/openssl/symhacks.h bn_x931p.c

View File

@@ -0,0 +1,27 @@
<OBSOLETE>
All assember in this directory are just version of the file
crypto/bn/bn_asm.c.
Quite a few of these files are just the assember output from gcc since on
quite a few machines they are 2 times faster than the system compiler.
For the x86, I have hand written assember because of the bad job all
compilers seem to do on it. This normally gives a 2 time speed up in the RSA
routines.
For the DEC alpha, I also hand wrote the assember (except the division which
is just the output from the C compiler pasted on the end of the file).
On the 2 alpha C compilers I had access to, it was not possible to do
64b x 64b -> 128b calculations (both long and the long long data types
were 64 bits). So the hand assember gives access to the 128 bit result and
a 2 times speedup :-).
There are 3 versions of assember for the HP PA-RISC.
pa-risc.s is the origional one which works fine and generated using gcc :-)
pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
by Chris Ruemmler from HP (with some help from the HP C compiler).
</OBSOLETE>

View File

@@ -0,0 +1,321 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl $num,$num
mov 0,v0
cmplt $num,4,AT
bne AT,.Lexit
ldq $hi0,0($ap) # ap[0]
s8addq $num,16,AT
ldq $aj,8($ap)
subq sp,AT,sp
ldq $bi,0($bp) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq $n0,0($n0)
and sp,AT,sp
mulq $hi0,$bi,$lo0
ldq $hi1,0($np) # np[0]
umulh $hi0,$bi,$hi0
ldq $nj,8($np)
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov 2,$j
umulh $aj,$bi,$ahi
mov sp,$tp
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
s8addq $j,$np,$nj
.align 4
.L1st:
.set noreorder
ldq $aj,0($aj)
addl $j,1,$j
ldq $nj,0($nj)
lda $tp,8($tp)
addq $alo,$hi0,$lo0
mulq $aj,$bi,$alo
cmpult $lo0,$hi0,AT
addq $nlo,$hi1,$lo1
mulq $nj,$m1,$nlo
addq $ahi,AT,$hi0
cmpult $lo1,$hi1,v0
cmplt $j,$num,$tj
umulh $aj,$bi,$ahi
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
s8addq $j,$np,$nj
stq $lo1,-8($tp)
nop
unop
bne $tj,.L1st
.set reorder
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
stq $lo1,0($tp)
addq $hi1,$hi0,$hi1
cmpult $hi1,$hi0,AT
stq $hi1,8($tp)
stq AT,16($tp)
mov 1,$i
.align 4
.Louter:
s8addq $i,$bp,$bi
ldq $hi0,0($ap)
ldq $aj,8($ap)
ldq $bi,0($bi)
ldq $hi1,0($np)
ldq $nj,8($np)
ldq $tj,0(sp)
mulq $hi0,$bi,$lo0
umulh $hi0,$bi,$hi0
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
mov 2,$j
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov sp,$tp
umulh $aj,$bi,$ahi
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
.align 4
.Linner:
.set noreorder
ldq $tj,8($tp) #L0
nop #U1
ldq $aj,0($aj) #L1
s8addq $j,$np,$nj #U0
ldq $nj,0($nj) #L0
nop #U1
addq $alo,$hi0,$lo0 #L1
lda $tp,8($tp)
mulq $aj,$bi,$alo #U1
cmpult $lo0,$hi0,AT #L0
addq $nlo,$hi1,$lo1 #L1
addl $j,1,$j
mulq $nj,$m1,$nlo #U1
addq $ahi,AT,$hi0 #L0
addq $lo0,$tj,$lo0 #L1
cmpult $lo1,$hi1,v0 #U0
umulh $aj,$bi,$ahi #U1
cmpult $lo0,$tj,AT #L0
addq $lo1,$lo0,$lo1 #L1
addq $nhi,v0,$hi1 #U0
umulh $nj,$m1,$nhi #U1
s8addq $j,$ap,$aj #L0
cmpult $lo1,$lo0,v0 #L1
cmplt $j,$num,$tj #U0 # borrow $tj
addq $hi0,AT,$hi0 #L0
addq $hi1,v0,$hi1 #U1
stq $lo1,-8($tp) #L1
bne $tj,.Linner #U0
.set reorder
ldq $tj,8($tp)
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
ldq $tj,16($tp)
addq $lo1,$lo0,$j
cmpult $j,$lo0,v0
addq $hi1,v0,$hi1
addq $hi1,$hi0,$lo1
stq $j,0($tp)
cmpult $lo1,$hi0,$hi1
addq $lo1,$tj,$lo1
cmpult $lo1,$tj,AT
addl $i,1,$i
addq $hi1,AT,$hi1
stq $lo1,8($tp)
cmplt $i,$num,$tj # borrow $tj
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,0($tp)
ldq $lo1,0($np)
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
or $hi0,AT,$hi0
stq $lo0,0($rp)
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
lda $tp,8($tp)
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $code;
close STDOUT;

View File

@@ -0,0 +1,289 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
bne .LNEON
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
#endif
___
}
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

View File

@@ -0,0 +1,676 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
bne .Louter
ldr $rp,[$_rp] @ pull rp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmull.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0
vmull.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmlal.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
add $tinptr,sp,#16
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.32 {$N0-$N1}, [$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero
sub $inner,$num,#8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
vld1.64 {$A7xB},[$tinptr,:128]!
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
.LNEON_inner:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.64 {$A7xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vmlal.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_inner
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
subs $outer,$outer,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,$temp,#16
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vshr.u64 $temp,$temp,#16
bne .LNEON_outer
mov $toutptr,sp
mov $inner,$num
.LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vld1.64 {$A7xB}, [$tinptr, :128]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
.LNEON_tail2:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
veor q0,q0,q0
sub r11,$bptr,sp @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;

View File

@@ -0,0 +1,774 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_part_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP(0,$a,"",0)); # *a
&mov($tmp2,&DWP(0,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP(0,$r,"",0),$tmp1); # *r
&add($a, 4);
&add($b, 4);
&add($r, 4);
&dec($num) if ($i != 6);
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
&cmp(&wparam(4),0);
&je(&label("pw_end"));
&mov($num,&wparam(4)); # get dl
&cmp($num,0);
&je(&label("pw_end"));
&jge(&label("pw_pos"));
&comment("pw_neg");
&mov($tmp2,0);
&sub($tmp2,$num);
&mov($num,$tmp2);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_neg_finish"));
&set_label("pw_neg_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl<0 Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("pw_end")) if ($i != 6);
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&set_label("pw_pos_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl>0 Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_tail_nc".$i));
&dec($num) if ($i != 6);
&jz(&label("pw_end")) if ($i != 6);
}
&mov($c,1);
&jmp(&label("pw_end"));
&set_label("pw_nc_loop",0);
for ($i=0; $i<8; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_tail_nc".$i,0);
&dec($num) if ($i != 6);
&jz(&label("pw_nc_end")) if ($i != 6);
}
&set_label("pw_nc_end",0);
&mov($c,0);
&set_label("pw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}

View File

@@ -0,0 +1,287 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}

View File

@@ -0,0 +1,851 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,426 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
$code=<<___;
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU $aj,$bi
$LD $alo,$BNSZ($ap)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$MULTU $lo0,$n0
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU $nj,$m1
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo $nlo
mfhi $nhi
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $alo,$BNSZ($ap)
$LD $tj,($sp)
$MULTU $aj,$bi
$LD $nj,($np)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$ADDU $lo0,$tj
$MULTU $lo0,$n0
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU $nj,$m1
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo $nlo
mfhi $nhi
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,$sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: $LD $aj,($ap)
$PTR_ADD $ap,$BNSZ
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,327 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";
$FRAME=8*(2+8);
$code=<<___;
#include <asm.h>
#include <regdef.h>
.text
.set noat
.set reorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
.set noreorder
PTR_SUB sp,64
move $fp,sp
.frame $fp,64,ra
slt AT,$num,4
li v0,0
beqzl AT,.Lproceed
nop
jr ra
PTR_ADD sp,$fp,64
.set reorder
.align 5
.Lproceed:
ld $n0,0($n0)
ld $bi,0($bp) # bp[0]
ld $aj,0($ap) # ap[0]
ld $nj,0($np) # np[0]
PTR_SUB sp,16 # place for two extra words
sll $num,3
li AT,-4096
PTR_SUB sp,$num
and sp,AT
sd s0,0($fp)
sd s1,8($fp)
sd s2,16($fp)
sd s3,24($fp)
sd s4,32($fp)
sd s5,40($fp)
sd s6,48($fp)
sd s7,56($fp)
dmultu $aj,$bi
ld $alo,8($ap)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
dmultu $lo0,$n0
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
.align 4
.L1st:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
dmultu $nj,$m1
daddu $hi1,AT
addu $j,8
sd $lo1,($tp)
sltu s7,$j,$num
mflo $nlo
mfhi $nhi
bnez s7,.L1st
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo1,$nlo,$hi1
sltu s7,$lo1,$hi1
daddu $hi1,$nhi,s7
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
sd $lo1,($tp)
daddu $hi1,$hi0
sltu AT,$hi1,$hi0
sd $hi1,8($tp)
sd AT,16($tp)
li $i,8
.align 4
.Louter:
PTR_ADD $bi,$bp,$i
ld $bi,($bi)
ld $aj,($ap)
ld $alo,8($ap)
ld $tj,(sp)
dmultu $aj,$bi
ld $nj,($np)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
daddu $lo0,$tj
dmultu $lo0,$n0
sltu AT,$lo0,$tj
daddu $hi0,AT
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
ld $tj,8($tp)
.align 4
.Linner:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo0,$tj
addu $j,8
dmultu $nj,$m1
sltu AT,$lo0,$tj
daddu $lo1,$lo0
daddu $hi0,AT
sltu s7,$lo1,$lo0
ld $tj,16($tp)
daddu $hi1,s7
sltu AT,$j,$num
mflo $nlo
mfhi $nhi
sd $lo1,($tp)
bnez AT,.Linner
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo0,$tj
sltu s7,$lo0,$tj
daddu $hi0,s7
ld $tj,16($tp)
daddu $lo1,$nlo,$hi1
sltu AT,$lo1,$hi1
daddu $hi1,$nhi,AT
daddu $lo1,$lo0
sltu s7,$lo1,$lo0
daddu $hi1,s7
sd $lo1,($tp)
daddu $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
daddu $lo1,$tj
sltu AT,$lo1,$tj
daddu $hi1,AT
sd $lo1,8($tp)
sd $hi1,16($tp)
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
ld s3,24($fp)
ld s4,32($fp)
ld s5,40($fp)
ld s6,48($fp)
ld s7,56($fp)
li v0,1
jr ra
PTR_ADD sp,$fp,64
.set reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,995 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplicatons [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths has to be even and vector
# addresses has to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$BN_SZ =$SIZE_T;
} else {
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$BN_SZ =$SIZE_T;
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
$BN_SZ=8;
$LEVEL="2.0";
last;
}
}
close CONF;
}
}
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32; # local variables
$tp="%r31";
$ti1="%r29";
$ti0="%r28";
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22"; # passed through stack in 32-bit
$num="%r21"; # passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";
$fp="%r3";
$hi1="%r2";
$hi0="%r1";
$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
$code.=<<___;
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $sub = sub {
my ($mod,$args) = @_;
my $orig = "sub$mod\t$args";
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
$opcode|=(1<<10); # e1
$opcode|=(1<<8); # e2
$opcode|=(1<<5); # d
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# flip word order in 64-bit mode...
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
# assemble 2.0 instructions in 32-bit mode...
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
s/\bbv\b/bve/gm if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,335 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter.
# It won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
$flavour = shift;
if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
$LDX= "lwzx"; # load indexed
$ST= "stw"; # store
$STU= "stwu"; # store and update
$STX= "stwx"; # store indexed
$STUX= "stwux"; # store indexed and update
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
$LDU= "ldu"; # load and update
$LDX= "ldx"; # load indexed
$ST= "std"; # store
$STU= "stdu"; # store and update
$STX= "stdx"; # store indexed
$STUX= "stdux"; # store indexed and update
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
$rp="r3"; $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9"; # $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
$code=<<___;
.machine "any"
.text
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
addze $hi1,$hi1
$UMULL $nlo,$nj,$m1 ; np[1]*m1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
L1st:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LDX $nj,$np,$j ; np[j]
addze $hi0,$ahi
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
addc $lo1,$nlo,$hi1
$UMULH $ahi,$aj,$m0
addze $hi1,$nhi
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
$UMULH $nhi,$nj,$m1
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $j,$j,$BNSZ ; j++
addi $tp,$tp,$BNSZ ; tp++
bdnz- L1st
;L1st
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
li $ovf,0
addc $hi1,$hi1,$hi0
addze $ovf,$ovf ; upmost overflow bit
$ST $hi1,$BNSZ($tp)
li $i,$BNSZ
.align 4
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi0,$hi0
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
$UMULL $nlo,$nj,$m1 ; np[1]*m1
addze $hi1,$hi1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
Linner:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LD $tj,$BNSZ($tp) ; tp[j]
addze $hi0,$ahi
$LDX $nj,$np,$j ; np[j]
addc $lo1,$nlo,$hi1
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi1,$nhi
$UMULH $ahi,$aj,$m0
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addze $hi0,$hi0
$UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addi $j,$j,$BNSZ ; j++
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $tp,$tp,$BNSZ ; tp++
bdnz- Linner
;Linner
$LD $tj,$BNSZ($tp) ; tp[j]
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
addze $hi0,$hi0
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
li $ovf,0
adde $hi1,$hi1,$hi0
addze $ovf,$ovf
$ST $hi1,$BNSZ($tp)
;
slwi $tj,$num,`log($BNSZ)/log(2)`
$UCMP $i,$tj
addi $i,$i,$BNSZ
ble- Louter
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.byte 0,12,4,0,0x80,12,6,0
.long 0
.size .bn_mul_mont_int,.-.bn_mul_mont_int
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. And indeed, the module delivers 55%-90%(*) improvement
# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
# even higher, for example on z990 it was measured 80%-150%. ECDSA
# sign is modest 9%-12% faster. Keep in mind that these coefficients
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for longest keys. Well, on a second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize amount of multiplications at
# the cost of other operations increase, bn_mul_mont aim to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
# int bn_mul_mont(
$rp="%r2"; # BN_ULONG *rp,
$ap="%r3"; # const BN_ULONG *ap,
$bp="%r4"; # const BN_ULONG *bp,
$np="%r5"; # const BN_ULONG *np,
$n0="%r6"; # const BN_ULONG *n0,
#$num="160(%r15)" # int num);
$bi="%r2"; # zaps rp
$j="%r7";
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$count="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

View File

@@ -0,0 +1,713 @@
.ident "s390x.S, version 1.1"
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
.text
#define zero %r0
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_add_words
.type bn_mul_add_words,@function
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r13,48(%r15)
lghi %r2,3
lghi %r12,0 // carry = 0
slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
lg %r7,0(%r3) // ap[0]
lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
brct %r4,.Loop4_madd
j .Loop4_madd_tail
.Loop4_madd:
mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
mlgr %r6,%r5
lg %r9,40(%r3)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
.Loop4_madd_tail:
mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
lgr %r2,zero // return value
alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
lgr %r12,%r6
la %r3,8(%r3) // i++
brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_words
.type bn_mul_words,@function
.align 4
bn_mul_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_mul // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_mul
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_mul // without touching condition code:-)
.Lend_mul:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_mul
j .Lend_mul
.size bn_mul_words,.-bn_mul_words
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
.globl bn_sqr_words
.type bn_sqr_words,@function
.align 4
bn_sqr_words:
ltgfr %r4,%r4
bler %r14
stmg %r6,%r7,48(%r15)
srag %r1,%r4,2 // cnt=len/4
jz .Loop1_sqr
.Loop4_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
lg %r7,8(%r3)
mlgr %r6,%r7
stg %r7,16(%r2)
stg %r6,24(%r2)
lg %r7,16(%r3)
mlgr %r6,%r7
stg %r7,32(%r2)
stg %r6,40(%r2)
lg %r7,24(%r3)
mlgr %r6,%r7
stg %r7,48(%r2)
stg %r6,56(%r2)
la %r3,32(%r3)
la %r2,64(%r2)
brct %r1,.Loop4_sqr
lghi %r1,3
nr %r4,%r1 // cnt=len%4
jz .Lend_sqr
.Loop1_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
la %r3,8(%r3)
la %r2,16(%r2)
brct %r4,.Loop1_sqr
.Lend_sqr:
lmg %r6,%r7,48(%r15)
br %r14
.size bn_sqr_words,.-bn_sqr_words
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
.globl bn_div_words
.type bn_div_words,@function
.align 4
bn_div_words:
dlgr %r2,%r4
lgr %r2,%r3
br %r14
.size bn_div_words,.-bn_div_words
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_add_words
.type bn_add_words,@function
.align 4
bn_add_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jz .Loop1_add // carry is incidentally cleared if branch taken
algr %r2,%r2 // clear carry
.Loop4_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
alcg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
alcg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
alcg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_add
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_add // without touching condition code:-)
.Lexit_add:
lghi %r2,0
alcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.Loop1_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_add
j .Lexit_add
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_sub_words
.type bn_sub_words,@function
.align 4
bn_sub_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
slgr %r2,%r2 // clear borrow
.Loop1_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_sub
j .Lexit_sub
.Loop4_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
slbg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
slbg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
slbg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_sub
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_sub // without touching condition code:-)
.Lexit_sub:
lghi %r2,0
slbgr %r2,%r2
lcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.size bn_sub_words,.-bn_sub_words
#define c1 %r1
#define c2 %r5
#define c3 %r8
#define mul_add_c(ai,bi,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,bi*8(%r4); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba8
.type bn_mul_comba8,@function
.align 4
bn_mul_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(4,0,c2,c3,c1);
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
mul_add_c(0,4,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(0,5,c3,c1,c2);
mul_add_c(1,4,c3,c1,c2);
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
mul_add_c(4,1,c3,c1,c2);
mul_add_c(5,0,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(6,0,c1,c2,c3);
mul_add_c(5,1,c1,c2,c3);
mul_add_c(4,2,c1,c2,c3);
mul_add_c(3,3,c1,c2,c3);
mul_add_c(2,4,c1,c2,c3);
mul_add_c(1,5,c1,c2,c3);
mul_add_c(0,6,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
mul_add_c(0,7,c2,c3,c1);
mul_add_c(1,6,c2,c3,c1);
mul_add_c(2,5,c2,c3,c1);
mul_add_c(3,4,c2,c3,c1);
mul_add_c(4,3,c2,c3,c1);
mul_add_c(5,2,c2,c3,c1);
mul_add_c(6,1,c2,c3,c1);
mul_add_c(7,0,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
mul_add_c(7,1,c3,c1,c2);
mul_add_c(6,2,c3,c1,c2);
mul_add_c(5,3,c3,c1,c2);
mul_add_c(4,4,c3,c1,c2);
mul_add_c(3,5,c3,c1,c2);
mul_add_c(2,6,c3,c1,c2);
mul_add_c(1,7,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
mul_add_c(2,7,c1,c2,c3);
mul_add_c(3,6,c1,c2,c3);
mul_add_c(4,5,c1,c2,c3);
mul_add_c(5,4,c1,c2,c3);
mul_add_c(6,3,c1,c2,c3);
mul_add_c(7,2,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
mul_add_c(7,3,c2,c3,c1);
mul_add_c(6,4,c2,c3,c1);
mul_add_c(5,5,c2,c3,c1);
mul_add_c(4,6,c2,c3,c1);
mul_add_c(3,7,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
mul_add_c(4,7,c3,c1,c2);
mul_add_c(5,6,c3,c1,c2);
mul_add_c(6,5,c3,c1,c2);
mul_add_c(7,4,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
mul_add_c(7,5,c1,c2,c3);
mul_add_c(6,6,c1,c2,c3);
mul_add_c(5,7,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
mul_add_c(6,7,c2,c3,c1);
mul_add_c(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
mul_add_c(7,7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba8,.-bn_mul_comba8
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba4
.type bn_mul_comba4,@function
.align 4
bn_mul_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r3)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(3,3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
stmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba4,.-bn_mul_comba4
#define sqr_add_c(ai,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlgr %r6,%r7; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
#define sqr_add_c2(ai,aj,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,aj*8(%r3); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba8
.type bn_sqr_comba8,@function
.align 4
bn_sqr_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
sqr_add_c2(4,0,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(5,0,c3,c1,c2);
sqr_add_c2(4,1,c3,c1,c2);
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
sqr_add_c2(4,2,c1,c2,c3);
sqr_add_c2(5,1,c1,c2,c3);
sqr_add_c2(6,0,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
sqr_add_c2(7,0,c2,c3,c1);
sqr_add_c2(6,1,c2,c3,c1);
sqr_add_c2(5,2,c2,c3,c1);
sqr_add_c2(4,3,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
sqr_add_c(4,c3,c1,c2);
sqr_add_c2(5,3,c3,c1,c2);
sqr_add_c2(6,2,c3,c1,c2);
sqr_add_c2(7,1,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
sqr_add_c2(7,2,c1,c2,c3);
sqr_add_c2(6,3,c1,c2,c3);
sqr_add_c2(5,4,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
sqr_add_c(5,c2,c3,c1);
sqr_add_c2(6,4,c2,c3,c1);
sqr_add_c2(7,3,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
sqr_add_c2(7,4,c3,c1,c2);
sqr_add_c2(6,5,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
sqr_add_c(6,c1,c2,c3);
sqr_add_c2(7,5,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
sqr_add_c2(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
sqr_add_c(7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba8,.-bn_sqr_comba8
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba4
.type bn_sqr_comba4,@function
.align 4
bn_sqr_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba4,.-bn_sqr_comba4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for all SPARCv9 processors and one for VIS3-capable ones. Former
# delivers ~25-45% more, more for longer keys, heaviest DH and DSA
# verify operations on venerable UltraSPARC II. On T4 VIS3 code is
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.
$locals=16*8;
$tab="%l0";
@T=("%g2","%g3");
@i=("%g4","%g5");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl bn_GF2m_mul_2x2
.align 16
bn_GF2m_mul_2x2:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
andcc %g1, SPARCV9_VIS3, %g0
bz,pn %icc,.Lsoftware
nop
sllx %o1, 32, %o1
sllx %o3, 32, %o3
or %o2, %o1, %o1
or %o4, %o3, %o3
.word 0x95b262ab ! xmulx %o1, %o3, %o2
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
srlx %o2, 32, %o1 ! 13 cycles later
st %o2, [%o0+0]
st %o1, [%o0+4]
srlx %o4, 32, %o3
st %o4, [%o0+8]
retl
st %o3, [%o0+12]
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
sllx %i3,32,$b
or %i2,$a,$a
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
sllx $a,1,$a2
srax $a4,63,@i[1] ! broadcast 61st bit
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
srlx $a48,2,$a48
srax $a2,63,@i[0] ! broadcast 62nd bit
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
srax $a1,63,$lo ! broadcast 63rd bit
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
sllx $a1,3,$a8
and $b,$lo,$lo
and $b,@i[0],@i[0]
and $b,@i[1],@i[1]
stx %g0,[$tab+0*8] ! tab[0]=0
xor $a1,$a2,$a12
stx $a1,[$tab+1*8] ! tab[1]=a1
stx $a2,[$tab+2*8] ! tab[2]=a2
xor $a4,$a8,$a48
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
xor $a4,$a1,$a1
stx $a4,[$tab+4*8] ! tab[4]=a4
xor $a4,$a2,$a2
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
xor $a4,$a12,$a12
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
xor $a48,$a1,$a1
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
xor $a48,$a2,$a2
stx $a8,[$tab+8*8] ! tab[8]=a8
xor $a48,$a12,$a12
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
xor $a4,$a1,$a1
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
xor $a4,$a2,$a2
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
xor $a4,$a12,$a12
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
srlx $lo,1,$hi
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
sllx $lo,63,$lo
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
srlx @i[0],2,@T[0]
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
sllx @i[0],62,$a1
sllx $b,3,@i[0]
srlx @i[1],3,@T[1]
and @i[0],`0xf<<3`,@i[0]
sllx @i[1],61,$a2
ldx [$tab+@i[0]],@i[0]
srlx $b,4-3,@i[1]
xor @T[0],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
xor $a1,$lo,$lo
ldx [$tab+@i[1]],@i[1]
xor @T[1],$hi,$hi
xor @i[0],$lo,$lo
srlx $b,8-3,@i[0]
xor $a2,$lo,$lo
and @i[0],`0xf<<3`,@i[0]
___
for($n=1;$n<14;$n++) {
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
srlx $b,`($n+2)*4`-3,@i[1]
xor @T[1],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
sllx @i[0],`($n+1)*4`,@T[0]
xor @T[1],$hi,$hi
srlx @i[0],`64-($n+1)*4`,@T[1]
xor @T[0],$lo,$lo
xor @T[1],$hi,$hi
srlx $lo,32,%i1
st $lo,[%i0+0]
st %i1,[%i0+4]
srlx $hi,32,%i2
st $hi,[%i0+8]
st %i2,[%i0+12]
ret
restore
.type bn_GF2m_mul_2x2,#function
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,606 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
# (*) Engine accessing the driver in question is on my TODO list.
# For reference, acceleator is estimated to give 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted
# that 6-10x improvement coefficient does not actually mean
# something extraordinary in terms of absolute [single-threaded]
# performance, as SPARCv9 instruction set is by all means least
# suitable for high performance crypto among other 64 bit
# platforms. 6-10x factor simply places T1 in same performance
# domain as say AMD64 and IA-64. Improvement of RSA verify don't
# appear impressive at all, but it's the sign operation which is
# far more critical/interesting.
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=128; }
$car0="%o0";
$car1="%o1";
$car2="%o2"; # 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1"; # 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.Lcopy:
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
mov 1,%i0
ret
restore
___
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2"; # re-use $bp!
$code.=<<___;
.align 32
.Lbn_sqr_mont:
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,882 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.
# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.
# TODO:
# - modulo-schedule inner loop for better performance (on in-order
# execution core such as UltraSPARC this shall result in further
# noticeable(!) improvement);
# - dedicated squaring procedure[?];
######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% perfromance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.
$fname="bn_mul_mont_fpu";
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) {
$bias=2047;
$frame=192;
} else {
$bias=0;
$frame=128; # 96 rounded up to largest known cache-line
}
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$tp="%l0"; # t[num]
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2"; # to these four vectors as double-precision FP values.
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7"; # 16-bit mask, 0xffff
$n0="%g4"; # reassigned(!) to "64-bit" register
$carry="%i4"; # %i4 reused(!) for a carry bit
# FP register naming chart
#
# ..HILO
# dcba
# --------
# LOa
# LOb
# LOc
# LOd
# HIa
# HIb
# HIc
# HId
# ..a
# ..b
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
$dota="%f24"; $dotb="%f26";
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
save %sp,-$frame-$locals,%sp
cmp $num,4
bl,a,pn %icc,.Lret
clr %i0
andcc $num,1,%g0 ! $num has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl $num,1,$num
sethi %hi(0xffff),$mask
ld [%i4+0],$n0 ! $n0 reassigned, remember?
or $mask,%lo(0xffff),$mask
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
sll $num,3,$num ! num*=8
add %sp,$bias,%o0 ! real top of stack
sll $num,2,%o1
add %o1,$num,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,$bias,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,$bias+$frame+$locals,$tp
add $tp,$num,$ap_l
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
add $ap_l,$num,$ap_h
add $ap_h,$num,$np_l
add $np_l,$num,$np_h
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
add $rp,$num,$rp ! readjust input pointers to point
add $ap,$num,$ap ! at the ends too...
add $bp,$num,$bp
add $np,$num,$np
stx %o7,[%sp+$bias+$frame+48] ! save %asi
sub %g0,$num,$i ! i=-num
sub %g0,$num,$j ! j=-num
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add $np,$j,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+$bias+$frame+0]
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
fxtod $alo,$alo
ldda [%o4+0]%asi,$bb
fxtod $ahi,$ahi
ldda [%o4+6]%asi,$bc
fxtod $nlo,$nlo
ldda [%o4+4]%asi,$bd
fxtod $nhi,$nhi
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fxtod $na,$na
std $ahi,[$ap_h+$j]
fxtod $nb,$nb
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fxtod $nc,$nc
std $nhi,[$np_h+$j]
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
add $j,8,$j
std $nlob,[%sp+$bias+$frame+8]
add $ap,$j,%o4
std $nloc,[%sp+$bias+$frame+16]
add $np,$j,%o5
std $nlod,[%sp+$bias+$frame+24]
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
!and %o0,$mask,%o0
!and %o1,$mask,%o1
!and %o2,$mask,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.L1stskip
std $nlod,[%sp+$bias+$frame+24]
.align 32 ! incidentally already aligned !
.L1st:
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
and %o1,$mask,%o1
and %o2,$mask,%o2
fmuld $ahi,$bb,$ahib
sllx %o1,16,%o1
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
sllx %o2,32,%o2
fmuld $ahi,$bc,$ahic
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
or %o2,%o0,%o0
fmuld $ahi,$bd,$ahid
or %o7,%o0,%o0 ! 64-bit result
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
addcc %g1,%o0,%o0
faddd $dota,$nloa,$nloa
srlx %o3,16,%g1 ! 34-bit carry
faddd $dotb,$nlob,$nlob
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+$bias+$frame+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,$carry
stx %o4,[$tp] ! tp[num-1]=
ba .Louter
add $i,8,$i
.align 32
.Louter:
sub %g0,$num,$j ! j=-num
add %sp,$bias+$frame+$locals,$tp
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[i]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [$tp],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+$bias+$frame+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
ldda [%o4+0]%asi,$bb
ldda [%o4+6]%asi,$bc
ldda [%o4+4]%asi,$bd
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
ldd [$ap_l+$j],$alo ! load a[j] in double format
fxtod $na,$na
ldd [$ap_h+$j],$ahi
fxtod $nb,$nb
ldd [$np_l+$j],$nlo ! load n[j] in double format
fxtod $nc,$nc
ldd [$np_h+$j],$nhi
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
add $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why?
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
ldx [$tp],%o7
faddd $nloc,$nhia,$nloc
addcc %o7,%o0,%o0
! end-of-why?
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.Linnerskip
std $nlod,[%sp+$bias+$frame+24]
ba .Linner
nop
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
faddd $nloc,$nhia,$nloc
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
addcc %o7,%o0,%o0
fdtox $nloc,$nloc
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
addcc $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+$bias+$frame+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+40],%o5
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc $carry,%o4,%o4
stx %o4,[$tp] ! tp[num-1]
mov %g1,$carry
bcs,a %xcc,.+8
add $carry,1,$carry
addcc $i,8,$i
bnz %icc,.Louter
nop
add $tp,8,$tp ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
ldx [$tp+%o7],%o0
add $np,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add $rp,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
ldx [$tp+%o7],%o0
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.Lzap:
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
stx %g0,[$np_h+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+$bias+$frame+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Below substitution makes it possible to compile without demanding
# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this, because VIS capability is detected at run-time now
# and this routine is not called on CPU not capable to execute it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded to %asi register,
# which assembler can't recognize as VIS specific...
$code =~ s/fzeros\s+%f([0-9]+)/
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
print $code;
# flush
close STDOUT;

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Wrapper around 'rep montmul', VIA-specific instruction accessing
# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
#
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
# different software configurations on 1.5GHz VIA Esther processor.
# Lines marked with "software integer" denote performance of hand-
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
# refers to hand-coded SSE2 Montgomery multiplication procedure found
# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
# Padlock SDK 2.0.1 available for download from VIA, which naturally
# utilizes the magic 'repz montmul' instruction. And finally "hardware
# this" refers to *this* implementation which also uses 'repz montmul'
#
# sign verify sign/s verify/s
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
#
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
#
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
#
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
#
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
#
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
#
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
#
# To give you some other reference point here is output for 2.4GHz P4
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
# SSE2" in above terms.
#
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
# key length;
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
$func="bn_mul_mont_padlock";
$pad=16*1; # amount of reserved bytes on top of every vector
# stack layout
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
$A=&DWP(4,"esp");
$B=&DWP(8,"esp");
$T=&DWP(12,"esp");
$M=&DWP(16,"esp");
$scratch=&DWP(20,"esp");
$rp=&DWP(24,"esp"); # these are mine
$sp=&DWP(28,"esp");
# &DWP(32,"esp") # 32 byte scratch area
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
# penalty. I allocate only as much as actually required...
&function_begin($func);
&xor ("eax","eax");
&mov ("ecx",&wparam(5)); # num
# meet VIA's limitations for num [note that the specification
# expresses them in bits, while we work with amount of 32-bit words]
&test ("ecx",3);
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&cld ();
&mov ("edi",&wparam(0)); # rp
&mov ("eax",&wparam(1)); # ap
&mov ("ebx",&wparam(2)); # bp
&mov ("edx",&wparam(3)); # np
&mov ("esi",&wparam(4)); # n0
&mov ("esi",&DWP(0,"esi")); # *n0
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
&neg ("ebp");
&add ("ebp","esp");
&and ("ebp",-64); # align to cache-line
&xchg ("ebp","esp"); # alloca
&mov ($rp,"edi"); # save rp
&mov ($sp,"ebp"); # save esp
&mov ($mZeroPrime,"esi");
&lea ("esi",&DWP(64,"esp")); # tp
&mov ($T,"esi");
&lea ("edi",&DWP(32,"esp")); # scratch area
&mov ($scratch,"edi");
&mov ("esi","eax");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
&mov ("ecx","ebp");
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
&mov ("ecx","ebp");
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&mov ("ebp",$sp);
&xor ("eax","eax");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&inc ("eax"); # signal "done"
&popf ();
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
___
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # BN_ULONG *rp,
$ap="%o1"; # const BN_ULONG *ap,
$bp="%o2"; # const BN_ULONG *bp,
$np="%o3"; # const BN_ULONG *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is even
# and >=6
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0] |
# +-------------------------------+
# . .
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 ap[1..0] | converted ap[]
# +-------------------------------+
# | __int64 np[1..0] | converted np[]
# +-------------------------------+
# | __int64 ap[3..2] |
# . .
# . .
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp
ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0
ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj
ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj
sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! copy or in-place refresh
ld [$ap+0], $t2
ld [$ap+4], $t3
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,313 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&sar (@i[1],31); # broardcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,593 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arraning 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","esp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengthes on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/perl
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
require("x86/mul_add.pl");
require("x86/mul.pl");
require("x86/sqr.pl");
require("x86/div.pl");
require("x86/add.pl");
require("x86/sub.pl");
require("x86/comba.pl");
&asm_init($ARGV[0],$0);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,277 @@
#!/usr/local/bin/perl
# x86 assember
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
1;

View File

@@ -0,0 +1,15 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_div_words
{
local($name)=@_;
&function_begin($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ebx",&wparam(2)); #
&div("ebx");
&function_end($name);
}
1;

View File

@@ -0,0 +1,3 @@
#!/usr/local/bin/perl
# x86 assember

View File

@@ -0,0 +1,77 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_mul_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
1;

View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_mul_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",0);
&mov(&swtmp(0),"ecx"); #
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= *r
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&mov("ecx",&swtmp(0)); #
&add($a,32);
&add($r,32);
&sub("ecx",8);
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c);
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
1;

View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_sqr_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
1;

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assember
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,638 @@
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*-
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
*
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features own ABI which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference.
* Win64 implements different ABI for AMD64, different from Linux.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For the reference. IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
# if defined(_WIN64) || !defined(__LP64__)
# define BN_ULONG unsigned long long
# else
# define BN_ULONG unsigned long
# endif
# undef mul
# undef mul_add
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
# define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
# define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
# undef sqr
# define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
: "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul_add(rp[0], ap[0], w, c1);
mul_add(rp[1], ap[1], w, c1);
mul_add(rp[2], ap[2], w, c1);
mul_add(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul_add(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul_add(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul_add(rp[2], ap[2], w, c1);
return c1;
}
return (c1);
}
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul(rp[0], ap[0], w, c1);
mul(rp[1], ap[1], w, c1);
mul(rp[2], ap[2], w, c1);
mul(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul(rp[2], ap[2], w, c1);
}
return (c1);
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0)
return;
while (n & ~3) {
sqr(r[0], r[1], a[0]);
sqr(r[2], r[3], a[1]);
sqr(r[4], r[5], a[2]);
sqr(r[6], r[7], a[3]);
a += 4;
r += 8;
n -= 4;
}
if (n) {
sqr(r[0], r[1], a[0]);
if (--n == 0)
return;
sqr(r[2], r[3], a[1]);
if (--n == 0)
return;
sqr(r[4], r[5], a[2]);
}
}
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
BN_ULONG ret, waste;
asm("divq %4":"=a"(ret), "=d"(waste)
: "a"(l), "d"(h), "g"(d)
: "cc");
return ret;
}
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear carry */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear borrow */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
# define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1, t2;
int c = 0;
if (n <= 0)
return ((BN_ULONG)0);
for (;;) {
t1 = a[0];
t2 = b[0];
r[0] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[1];
t2 = b[1];
r[1] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[2];
t2 = b[2];
r[2] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[3];
t2 = b[3];
r[3] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
a += 4;
b += 4;
r += 4;
}
return (c);
}
# endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
* c=(c2,c1,c0)
*/
/*
* Keep in mind that carrying into high part of multiplication result
* can not overflow, because it cannot be all-ones.
*/
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; tt = hi+((c0<lo)?1:0); \
c1 += tt; c2 += (c1<tt)?1:0; \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,ta); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# else
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# endif
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[4], b[0], c2, c3, c1);
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
mul_add_c(a[0], b[4], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[0], b[5], c3, c1, c2);
mul_add_c(a[1], b[4], c3, c1, c2);
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
mul_add_c(a[4], b[1], c3, c1, c2);
mul_add_c(a[5], b[0], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[6], b[0], c1, c2, c3);
mul_add_c(a[5], b[1], c1, c2, c3);
mul_add_c(a[4], b[2], c1, c2, c3);
mul_add_c(a[3], b[3], c1, c2, c3);
mul_add_c(a[2], b[4], c1, c2, c3);
mul_add_c(a[1], b[5], c1, c2, c3);
mul_add_c(a[0], b[6], c1, c2, c3);
r[6] = c1;
c1 = 0;
mul_add_c(a[0], b[7], c2, c3, c1);
mul_add_c(a[1], b[6], c2, c3, c1);
mul_add_c(a[2], b[5], c2, c3, c1);
mul_add_c(a[3], b[4], c2, c3, c1);
mul_add_c(a[4], b[3], c2, c3, c1);
mul_add_c(a[5], b[2], c2, c3, c1);
mul_add_c(a[6], b[1], c2, c3, c1);
mul_add_c(a[7], b[0], c2, c3, c1);
r[7] = c2;
c2 = 0;
mul_add_c(a[7], b[1], c3, c1, c2);
mul_add_c(a[6], b[2], c3, c1, c2);
mul_add_c(a[5], b[3], c3, c1, c2);
mul_add_c(a[4], b[4], c3, c1, c2);
mul_add_c(a[3], b[5], c3, c1, c2);
mul_add_c(a[2], b[6], c3, c1, c2);
mul_add_c(a[1], b[7], c3, c1, c2);
r[8] = c3;
c3 = 0;
mul_add_c(a[2], b[7], c1, c2, c3);
mul_add_c(a[3], b[6], c1, c2, c3);
mul_add_c(a[4], b[5], c1, c2, c3);
mul_add_c(a[5], b[4], c1, c2, c3);
mul_add_c(a[6], b[3], c1, c2, c3);
mul_add_c(a[7], b[2], c1, c2, c3);
r[9] = c1;
c1 = 0;
mul_add_c(a[7], b[3], c2, c3, c1);
mul_add_c(a[6], b[4], c2, c3, c1);
mul_add_c(a[5], b[5], c2, c3, c1);
mul_add_c(a[4], b[6], c2, c3, c1);
mul_add_c(a[3], b[7], c2, c3, c1);
r[10] = c2;
c2 = 0;
mul_add_c(a[4], b[7], c3, c1, c2);
mul_add_c(a[5], b[6], c3, c1, c2);
mul_add_c(a[6], b[5], c3, c1, c2);
mul_add_c(a[7], b[4], c3, c1, c2);
r[11] = c3;
c3 = 0;
mul_add_c(a[7], b[5], c1, c2, c3);
mul_add_c(a[6], b[6], c1, c2, c3);
mul_add_c(a[5], b[7], c1, c2, c3);
r[12] = c1;
c1 = 0;
mul_add_c(a[6], b[7], c2, c3, c1);
mul_add_c(a[7], b[6], c2, c3, c1);
r[13] = c2;
c2 = 0;
mul_add_c(a[7], b[7], c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[3], b[3], c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
sqr_add_c2(a, 4, 0, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 5, 0, c3, c1, c2);
sqr_add_c2(a, 4, 1, c3, c1, c2);
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
sqr_add_c2(a, 4, 2, c1, c2, c3);
sqr_add_c2(a, 5, 1, c1, c2, c3);
sqr_add_c2(a, 6, 0, c1, c2, c3);
r[6] = c1;
c1 = 0;
sqr_add_c2(a, 7, 0, c2, c3, c1);
sqr_add_c2(a, 6, 1, c2, c3, c1);
sqr_add_c2(a, 5, 2, c2, c3, c1);
sqr_add_c2(a, 4, 3, c2, c3, c1);
r[7] = c2;
c2 = 0;
sqr_add_c(a, 4, c3, c1, c2);
sqr_add_c2(a, 5, 3, c3, c1, c2);
sqr_add_c2(a, 6, 2, c3, c1, c2);
sqr_add_c2(a, 7, 1, c3, c1, c2);
r[8] = c3;
c3 = 0;
sqr_add_c2(a, 7, 2, c1, c2, c3);
sqr_add_c2(a, 6, 3, c1, c2, c3);
sqr_add_c2(a, 5, 4, c1, c2, c3);
r[9] = c1;
c1 = 0;
sqr_add_c(a, 5, c2, c3, c1);
sqr_add_c2(a, 6, 4, c2, c3, c1);
sqr_add_c2(a, 7, 3, c2, c3, c1);
r[10] = c2;
c2 = 0;
sqr_add_c2(a, 7, 4, c3, c1, c2);
sqr_add_c2(a, 6, 5, c3, c1, c2);
r[11] = c3;
c3 = 0;
sqr_add_c(a, 6, c1, c2, c3);
sqr_add_c2(a, 7, 5, c1, c2, c3);
r[12] = c1;
c1 = 0;
sqr_add_c2(a, 7, 6, c2, c3, c1);
r[13] = c2;
c2 = 0;
sqr_add_c(a, 7, c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
#endif

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
sub \$128+8,%rsp
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # boardcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a1^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
ret
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
mov OPENSSL_ia32cap_P(%rip),%rax
bt \$33,%rax
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
mov %r13,8*11(%rsp)
mov %r12,8*12(%rsp)
mov %rbp,8*13(%rsp)
mov %rbx,8*14(%rsp)
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
mov 8*11(%rsp),%r13
mov 8*12(%rsp),%r12
mov 8*13(%rsp),%rbp
mov 8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
ret
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
.Lin_prologue:
lea 8*17(%rax),%rax
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,939 @@
/* crypto/bn/bn.h */
/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* ====================================================================
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
*
* Portions of the attached software ("Contribution") are developed by
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
*
* The Contribution is licensed pursuant to the Eric Young open source
* license provided above.
*
* The binary polynomial arithmetic software is originally written by
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems Laboratories.
*
*/
#ifndef HEADER_BN_H
# define HEADER_BN_H
# include <openssl/e_os2.h>
# ifndef OPENSSL_NO_FP_API
# include <stdio.h> /* FILE */
# endif
# include <openssl/ossl_typ.h>
# include <openssl/crypto.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* These preprocessor symbols control various aspects of the bignum headers
* and library code. They're not defined by any "normal" configuration, as
* they are intended for development and testing purposes. NB: defining all
* three can be useful for debugging application code as well as openssl
* itself. BN_DEBUG - turn on various debugging alterations to the bignum
* code BN_DEBUG_RAND - uses random poisoning of unused words to trip up
* mismanagement of bignum internals. You must also define BN_DEBUG.
*/
/* #define BN_DEBUG */
/* #define BN_DEBUG_RAND */
# ifndef OPENSSL_SMALL_FOOTPRINT
# define BN_MUL_COMBA
# define BN_SQR_COMBA
# define BN_RECURSION
# endif
/*
* This next option uses the C libraries (2 word)/(1 word) function. If it is
* not defined, I use my C version (which is slower). The reason for this
* flag is that when the particular C compiler library routine is used, and
* the library is linked with a different compiler, the library is missing.
* This mostly happens when the library is built with gcc and then linked
* using normal cc. This would be a common occurrence because gcc normally
* produces code that is 2 times faster than system compilers for the big
* number stuff. For machines with only one compiler (or shared libraries),
* this should be on. Again this in only really a problem on machines using
* "long long's", are 32bit, and are not using my assembler code.
*/
# if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \
defined(OPENSSL_SYS_WIN32) || defined(linux)
# ifndef BN_DIV2W
# define BN_DIV2W
# endif
# endif
/*
* assuming long is 64bit - this is the DEC Alpha unsigned long long is only
* 64 bits :-(, don't define BN_LLONG for the DEC Alpha
*/
# ifdef SIXTY_FOUR_BIT_LONG
# define BN_ULLONG unsigned long long
# define BN_ULONG unsigned long
# define BN_LONG long
# define BN_BITS 128
# define BN_BYTES 8
# define BN_BITS2 64
# define BN_BITS4 32
# define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
# define BN_MASK2 (0xffffffffffffffffL)
# define BN_MASK2l (0xffffffffL)
# define BN_MASK2h (0xffffffff00000000L)
# define BN_MASK2h1 (0xffffffff80000000L)
# define BN_TBIT (0x8000000000000000L)
# define BN_DEC_CONV (10000000000000000000UL)
# define BN_DEC_FMT1 "%lu"
# define BN_DEC_FMT2 "%019lu"
# define BN_DEC_NUM 19
# define BN_HEX_FMT1 "%lX"
# define BN_HEX_FMT2 "%016lX"
# endif
/*
* This is where the long long data type is 64 bits, but long is 32. For
* machines where there are 64bit registers, this is the mode to use. IRIX,
* on R4000 and above should use this mode, along with the relevant assembler
* code :-). Do NOT define BN_LLONG.
*/
# ifdef SIXTY_FOUR_BIT
# undef BN_LLONG
# undef BN_ULLONG
# define BN_ULONG unsigned long long
# define BN_LONG long long
# define BN_BITS 128
# define BN_BYTES 8
# define BN_BITS2 64
# define BN_BITS4 32
# define BN_MASK2 (0xffffffffffffffffLL)
# define BN_MASK2l (0xffffffffL)
# define BN_MASK2h (0xffffffff00000000LL)
# define BN_MASK2h1 (0xffffffff80000000LL)
# define BN_TBIT (0x8000000000000000LL)
# define BN_DEC_CONV (10000000000000000000ULL)
# define BN_DEC_FMT1 "%llu"
# define BN_DEC_FMT2 "%019llu"
# define BN_DEC_NUM 19
# define BN_HEX_FMT1 "%llX"
# define BN_HEX_FMT2 "%016llX"
# endif
# ifdef THIRTY_TWO_BIT
# ifdef BN_LLONG
# if defined(_WIN32) && !defined(__GNUC__)
# define BN_ULLONG unsigned __int64
# define BN_MASK (0xffffffffffffffffI64)
# else
# define BN_ULLONG unsigned long long
# define BN_MASK (0xffffffffffffffffLL)
# endif
# endif
# define BN_ULONG unsigned int
# define BN_LONG int
# define BN_BITS 64
# define BN_BYTES 4
# define BN_BITS2 32
# define BN_BITS4 16
# define BN_MASK2 (0xffffffffL)
# define BN_MASK2l (0xffff)
# define BN_MASK2h1 (0xffff8000L)
# define BN_MASK2h (0xffff0000L)
# define BN_TBIT (0x80000000L)
# define BN_DEC_CONV (1000000000L)
# define BN_DEC_FMT1 "%u"
# define BN_DEC_FMT2 "%09u"
# define BN_DEC_NUM 9
# define BN_HEX_FMT1 "%X"
# define BN_HEX_FMT2 "%08X"
# endif
# define BN_DEFAULT_BITS 1280
# define BN_FLG_MALLOCED 0x01
# define BN_FLG_STATIC_DATA 0x02
/*
* avoid leaking exponent information through timing,
* BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
* BN_div() will call BN_div_no_branch,
* BN_mod_inverse() will call BN_mod_inverse_no_branch.
*/
# define BN_FLG_CONSTTIME 0x04
# ifdef OPENSSL_NO_DEPRECATED
/* deprecated name for the flag */
# define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME
/*
* avoid leaking exponent information through timings
* (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime)
*/
# endif
# ifndef OPENSSL_NO_DEPRECATED
# define BN_FLG_FREE 0x8000
/* used for debuging */
# endif
# define BN_set_flags(b,n) ((b)->flags|=(n))
# define BN_get_flags(b,n) ((b)->flags&(n))
/*
* get a clone of a BIGNUM with changed flags, for *temporary* use only (the
* two BIGNUMs cannot not be used in parallel!)
*/
# define BN_with_flags(dest,b,n) ((dest)->d=(b)->d, \
(dest)->top=(b)->top, \
(dest)->dmax=(b)->dmax, \
(dest)->neg=(b)->neg, \
(dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
| ((b)->flags & ~BN_FLG_MALLOCED) \
| BN_FLG_STATIC_DATA \
| (n)))
/* Already declared in ossl_typ.h */
# if 0
typedef struct bignum_st BIGNUM;
/* Used for temp variables (declaration hidden in bn_lcl.h) */
typedef struct bignum_ctx BN_CTX;
typedef struct bn_blinding_st BN_BLINDING;
typedef struct bn_mont_ctx_st BN_MONT_CTX;
typedef struct bn_recp_ctx_st BN_RECP_CTX;
typedef struct bn_gencb_st BN_GENCB;
# endif
struct bignum_st {
BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit
* chunks. */
int top; /* Index of last used d +1. */
/* The next are internal book keeping for bn_expand. */
int dmax; /* Size of the d array. */
int neg; /* one if the number is negative */
int flags;
};
/* Used for montgomery multiplication */
struct bn_mont_ctx_st {
int ri; /* number of bits in R */
BIGNUM RR; /* used to convert to montgomery form */
BIGNUM N; /* The modulus */
BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1 (Ni is only
* stored for bignum algorithm) */
BN_ULONG n0[2]; /* least significant word(s) of Ni; (type
* changed with 0.9.9, was "BN_ULONG n0;"
* before) */
int flags;
};
/*
* Used for reciprocal division/mod functions It cannot be shared between
* threads
*/
struct bn_recp_ctx_st {
BIGNUM N; /* the divisor */
BIGNUM Nr; /* the reciprocal */
int num_bits;
int shift;
int flags;
};
/* Used for slow "generation" functions. */
struct bn_gencb_st {
unsigned int ver; /* To handle binary (in)compatibility */
void *arg; /* callback-specific data */
union {
/* if(ver==1) - handles old style callbacks */
void (*cb_1) (int, int, void *);
/* if(ver==2) - new callback style */
int (*cb_2) (int, int, BN_GENCB *);
} cb;
};
/* Wrapper function to make using BN_GENCB easier, */
int BN_GENCB_call(BN_GENCB *cb, int a, int b);
/* Macro to populate a BN_GENCB structure with an "old"-style callback */
# define BN_GENCB_set_old(gencb, callback, cb_arg) { \
BN_GENCB *tmp_gencb = (gencb); \
tmp_gencb->ver = 1; \
tmp_gencb->arg = (cb_arg); \
tmp_gencb->cb.cb_1 = (callback); }
/* Macro to populate a BN_GENCB structure with a "new"-style callback */
# define BN_GENCB_set(gencb, callback, cb_arg) { \
BN_GENCB *tmp_gencb = (gencb); \
tmp_gencb->ver = 2; \
tmp_gencb->arg = (cb_arg); \
tmp_gencb->cb.cb_2 = (callback); }
# define BN_prime_checks 0 /* default: select number of iterations based
* on the size of the number */
/*
* number of Miller-Rabin iterations for an error rate of less than 2^-80 for
* random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook of
* Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
* original paper: Damgaard, Landrock, Pomerance: Average case error
* estimates for the strong probable prime test. -- Math. Comp. 61 (1993)
* 177-194)
*/
# define BN_prime_checks_for_size(b) ((b) >= 1300 ? 2 : \
(b) >= 850 ? 3 : \
(b) >= 650 ? 4 : \
(b) >= 550 ? 5 : \
(b) >= 450 ? 6 : \
(b) >= 400 ? 7 : \
(b) >= 350 ? 8 : \
(b) >= 300 ? 9 : \
(b) >= 250 ? 12 : \
(b) >= 200 ? 15 : \
(b) >= 150 ? 18 : \
/* b >= 100 */ 27)
# define BN_num_bytes(a) ((BN_num_bits(a)+7)/8)
/* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */
# define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
(((w) == 0) && ((a)->top == 0)))
# define BN_is_zero(a) ((a)->top == 0)
# define BN_is_one(a) (BN_abs_is_word((a),1) && !(a)->neg)
# define BN_is_word(a,w) (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
# define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1))
# define BN_one(a) (BN_set_word((a),1))
# define BN_zero_ex(a) \
do { \
BIGNUM *_tmp_bn = (a); \
_tmp_bn->top = 0; \
_tmp_bn->neg = 0; \
} while(0)
# ifdef OPENSSL_NO_DEPRECATED
# define BN_zero(a) BN_zero_ex(a)
# else
# define BN_zero(a) (BN_set_word((a),0))
# endif
const BIGNUM *BN_value_one(void);
char *BN_options(void);
BN_CTX *BN_CTX_new(void);
# ifndef OPENSSL_NO_DEPRECATED
void BN_CTX_init(BN_CTX *c);
# endif
void BN_CTX_free(BN_CTX *c);
void BN_CTX_start(BN_CTX *ctx);
BIGNUM *BN_CTX_get(BN_CTX *ctx);
void BN_CTX_end(BN_CTX *ctx);
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom);
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom);
int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
int BN_num_bits(const BIGNUM *a);
int BN_num_bits_word(BN_ULONG);
BIGNUM *BN_new(void);
void BN_init(BIGNUM *);
void BN_clear_free(BIGNUM *a);
BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
void BN_swap(BIGNUM *a, BIGNUM *b);
BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret);
int BN_bn2bin(const BIGNUM *a, unsigned char *to);
BIGNUM *BN_mpi2bn(const unsigned char *s, int len, BIGNUM *ret);
int BN_bn2mpi(const BIGNUM *a, unsigned char *to);
int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
/** BN_set_negative sets sign of a BIGNUM
* \param b pointer to the BIGNUM object
* \param n 0 if the BIGNUM b should be positive and a value != 0 otherwise
*/
void BN_set_negative(BIGNUM *b, int n);
/** BN_is_negative returns 1 if the BIGNUM is negative
* \param a pointer to the BIGNUM object
* \return 1 if a < 0 and 0 otherwise
*/
# define BN_is_negative(a) ((a)->neg != 0)
int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
BN_CTX *ctx);
# define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx))
int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx);
int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m);
int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx);
int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m);
int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx);
int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
BN_CTX *ctx);
int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
int BN_mul_word(BIGNUM *a, BN_ULONG w);
int BN_add_word(BIGNUM *a, BN_ULONG w);
int BN_sub_word(BIGNUM *a, BN_ULONG w);
int BN_set_word(BIGNUM *a, BN_ULONG w);
BN_ULONG BN_get_word(const BIGNUM *a);
int BN_cmp(const BIGNUM *a, const BIGNUM *b);
void BN_free(BIGNUM *a);
int BN_is_bit_set(const BIGNUM *a, int n);
int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
int BN_lshift1(BIGNUM *r, const BIGNUM *a);
int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx);
int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx,
BN_MONT_CTX *in_mont);
int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
BN_CTX *ctx, BN_MONT_CTX *m_ctx);
int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx);
int BN_mask_bits(BIGNUM *a, int n);
# ifndef OPENSSL_NO_FP_API
int BN_print_fp(FILE *fp, const BIGNUM *a);
# endif
# ifdef HEADER_BIO_H
int BN_print(BIO *fp, const BIGNUM *a);
# else
int BN_print(void *fp, const BIGNUM *a);
# endif
int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx);
int BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
int BN_rshift1(BIGNUM *r, const BIGNUM *a);
void BN_clear(BIGNUM *a);
BIGNUM *BN_dup(const BIGNUM *a);
int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
int BN_set_bit(BIGNUM *a, int n);
int BN_clear_bit(BIGNUM *a, int n);
char *BN_bn2hex(const BIGNUM *a);
char *BN_bn2dec(const BIGNUM *a);
int BN_hex2bn(BIGNUM **a, const char *str);
int BN_dec2bn(BIGNUM **a, const char *str);
int BN_asc2bn(BIGNUM **a, const char *str);
int BN_gcd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); /* returns
* -2 for
* error */
BIGNUM *BN_mod_inverse(BIGNUM *ret,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
BIGNUM *BN_mod_sqrt(BIGNUM *ret,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
void BN_consttime_swap(BN_ULONG swap, BIGNUM *a, BIGNUM *b, int nwords);
/* Deprecated versions */
# ifndef OPENSSL_NO_DEPRECATED
BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
const BIGNUM *add, const BIGNUM *rem,
void (*callback) (int, int, void *), void *cb_arg);
int BN_is_prime(const BIGNUM *p, int nchecks,
void (*callback) (int, int, void *),
BN_CTX *ctx, void *cb_arg);
int BN_is_prime_fasttest(const BIGNUM *p, int nchecks,
void (*callback) (int, int, void *), BN_CTX *ctx,
void *cb_arg, int do_trial_division);
# endif /* !defined(OPENSSL_NO_DEPRECATED) */
/* Newer versions */
int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
const BIGNUM *rem, BN_GENCB *cb);
int BN_is_prime_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx, BN_GENCB *cb);
int BN_is_prime_fasttest_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
const BIGNUM *Xp, const BIGNUM *Xp1,
const BIGNUM *Xp2, const BIGNUM *e, BN_CTX *ctx,
BN_GENCB *cb);
int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, BIGNUM *Xp1,
BIGNUM *Xp2, const BIGNUM *Xp, const BIGNUM *e,
BN_CTX *ctx, BN_GENCB *cb);
BN_MONT_CTX *BN_MONT_CTX_new(void);
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx);
# define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\
(r),(a),&((mont)->RR),(mont),(ctx))
int BN_from_montgomery(BIGNUM *r, const BIGNUM *a,
BN_MONT_CTX *mont, BN_CTX *ctx);
void BN_MONT_CTX_free(BN_MONT_CTX *mont);
int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx);
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from);
BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
const BIGNUM *mod, BN_CTX *ctx);
/* BN_BLINDING flags */
# define BN_BLINDING_NO_UPDATE 0x00000001
# define BN_BLINDING_NO_RECREATE 0x00000002
BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
void BN_BLINDING_free(BN_BLINDING *b);
int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx);
int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b,
BN_CTX *);
# ifndef OPENSSL_NO_DEPRECATED
unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
# endif
CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
int (*bn_mod_exp) (BIGNUM *r,
const BIGNUM *a,
const BIGNUM *p,
const BIGNUM *m,
BN_CTX *ctx,
BN_MONT_CTX *m_ctx),
BN_MONT_CTX *m_ctx);
# ifndef OPENSSL_NO_DEPRECATED
void BN_set_params(int mul, int high, int low, int mont);
int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */
# endif
void BN_RECP_CTX_init(BN_RECP_CTX *recp);
BN_RECP_CTX *BN_RECP_CTX_new(void);
void BN_RECP_CTX_free(BN_RECP_CTX *recp);
int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *rdiv, BN_CTX *ctx);
int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
BN_RECP_CTX *recp, BN_CTX *ctx);
int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx);
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx);
# ifndef OPENSSL_NO_EC2M
/*
* Functions for arithmetic over binary polynomials represented by BIGNUMs.
* The BIGNUM::neg property of BIGNUMs representing binary polynomials is
* ignored. Note that input arguments are not const so that their bit arrays
* can be expanded to the appropriate size if needed.
*/
/*
* r = a + b
*/
int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
# define BN_GF2m_sub(r, a, b) BN_GF2m_add(r, a, b)
/*
* r=a mod p
*/
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p);
/* r = (a * b) mod p */
int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *p, BN_CTX *ctx);
/* r = (a * a) mod p */
int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
/* r = (1 / b) mod p */
int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx);
/* r = (a / b) mod p */
int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *p, BN_CTX *ctx);
/* r = (a ^ b) mod p */
int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *p, BN_CTX *ctx);
/* r = sqrt(a) mod p */
int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx);
/* r^2 + r = a mod p */
int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx);
# define BN_GF2m_cmp(a, b) BN_ucmp((a), (b))
/*-
* Some functions allow for representation of the irreducible polynomials
* as an unsigned int[], say p. The irreducible f(t) is then of the form:
* t^p[0] + t^p[1] + ... + t^p[k]
* where m = p[0] > p[1] > ... > p[k] = 0.
*/
/* r = a mod p */
int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
/* r = (a * b) mod p */
int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const int p[], BN_CTX *ctx);
/* r = (a * a) mod p */
int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
BN_CTX *ctx);
/* r = (1 / b) mod p */
int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
BN_CTX *ctx);
/* r = (a / b) mod p */
int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const int p[], BN_CTX *ctx);
/* r = (a ^ b) mod p */
int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const int p[], BN_CTX *ctx);
/* r = sqrt(a) mod p */
int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
const int p[], BN_CTX *ctx);
/* r^2 + r = a mod p */
int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
const int p[], BN_CTX *ctx);
int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
# endif
/*
* faster mod functions for the 'NIST primes' 0 <= a < p^2
*/
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
const BIGNUM *BN_get0_nist_prime_192(void);
const BIGNUM *BN_get0_nist_prime_224(void);
const BIGNUM *BN_get0_nist_prime_256(void);
const BIGNUM *BN_get0_nist_prime_384(void);
const BIGNUM *BN_get0_nist_prime_521(void);
/* library internal functions */
# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
(a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2))
# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
BIGNUM *bn_expand2(BIGNUM *a, int words);
# ifndef OPENSSL_NO_DEPRECATED
BIGNUM *bn_dup_expand(const BIGNUM *a, int words); /* unused */
# endif
/*-
* Bignum consistency macros
* There is one "API" macro, bn_fix_top(), for stripping leading zeroes from
* bignum data after direct manipulations on the data. There is also an
* "internal" macro, bn_check_top(), for verifying that there are no leading
* zeroes. Unfortunately, some auditing is required due to the fact that
* bn_fix_top() has become an overabused duct-tape because bignum data is
* occasionally passed around in an inconsistent state. So the following
* changes have been made to sort this out;
* - bn_fix_top()s implementation has been moved to bn_correct_top()
* - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and
* bn_check_top() is as before.
* - if BN_DEBUG *is* defined;
* - bn_check_top() tries to pollute unused words even if the bignum 'top' is
* consistent. (ed: only if BN_DEBUG_RAND is defined)
* - bn_fix_top() maps to bn_check_top() rather than "fixing" anything.
* The idea is to have debug builds flag up inconsistent bignums when they
* occur. If that occurs in a bn_fix_top(), we examine the code in question; if
* the use of bn_fix_top() was appropriate (ie. it follows directly after code
* that manipulates the bignum) it is converted to bn_correct_top(), and if it
* was not appropriate, we convert it permanently to bn_check_top() and track
* down the cause of the bug. Eventually, no internal code should be using the
* bn_fix_top() macro. External applications and libraries should try this with
* their own code too, both in terms of building against the openssl headers
* with BN_DEBUG defined *and* linking with a version of OpenSSL built with it
* defined. This not only improves external code, it provides more test
* coverage for openssl's own code.
*/
# ifdef BN_DEBUG
/* We only need assert() when debugging */
# include <assert.h>
# ifdef BN_DEBUG_RAND
/* To avoid "make update" cvs wars due to BN_DEBUG, use some tricks */
# ifndef RAND_pseudo_bytes
int RAND_pseudo_bytes(unsigned char *buf, int num);
# define BN_DEBUG_TRIX
# endif
# define bn_pollute(a) \
do { \
const BIGNUM *_bnum1 = (a); \
if(_bnum1->top < _bnum1->dmax) { \
unsigned char _tmp_char; \
/* We cast away const without the compiler knowing, any \
* *genuinely* constant variables that aren't mutable \
* wouldn't be constructed with top!=dmax. */ \
BN_ULONG *_not_const; \
memcpy(&_not_const, &_bnum1->d, sizeof(BN_ULONG*)); \
/* Debug only - safe to ignore error return */ \
RAND_pseudo_bytes(&_tmp_char, 1); \
memset((unsigned char *)(_not_const + _bnum1->top), _tmp_char, \
(_bnum1->dmax - _bnum1->top) * sizeof(BN_ULONG)); \
} \
} while(0)
# ifdef BN_DEBUG_TRIX
# undef RAND_pseudo_bytes
# endif
# else
# define bn_pollute(a)
# endif
# define bn_check_top(a) \
do { \
const BIGNUM *_bnum2 = (a); \
if (_bnum2 != NULL) { \
assert((_bnum2->top == 0) || \
(_bnum2->d[_bnum2->top - 1] != 0)); \
bn_pollute(_bnum2); \
} \
} while(0)
# define bn_fix_top(a) bn_check_top(a)
# define bn_check_size(bn, bits) bn_wcheck_size(bn, ((bits+BN_BITS2-1))/BN_BITS2)
# define bn_wcheck_size(bn, words) \
do { \
const BIGNUM *_bnum2 = (bn); \
assert((words) <= (_bnum2)->dmax && (words) >= (_bnum2)->top); \
/* avoid unused variable warning with NDEBUG */ \
(void)(_bnum2); \
} while(0)
# else /* !BN_DEBUG */
# define bn_pollute(a)
# define bn_check_top(a)
# define bn_fix_top(a) bn_correct_top(a)
# define bn_check_size(bn, bits)
# define bn_wcheck_size(bn, words)
# endif
# define bn_correct_top(a) \
{ \
BN_ULONG *ftl; \
int tmp_top = (a)->top; \
if (tmp_top > 0) \
{ \
for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
if (*(ftl--)) break; \
(a)->top = tmp_top; \
} \
bn_pollute(a); \
}
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w);
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int num);
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int num);
/* Primes from RFC 2409 */
BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
/* Primes from RFC 3526 */
BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom);
/* BEGIN ERROR CODES */
/*
* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
*/
void ERR_load_BN_strings(void);
/* Error codes for the BN functions. */
/* Function codes. */
# define BN_F_BNRAND 127
# define BN_F_BN_BLINDING_CONVERT_EX 100
# define BN_F_BN_BLINDING_CREATE_PARAM 128
# define BN_F_BN_BLINDING_INVERT_EX 101
# define BN_F_BN_BLINDING_NEW 102
# define BN_F_BN_BLINDING_UPDATE 103
# define BN_F_BN_BN2DEC 104
# define BN_F_BN_BN2HEX 105
# define BN_F_BN_CTX_GET 116
# define BN_F_BN_CTX_NEW 106
# define BN_F_BN_CTX_START 129
# define BN_F_BN_DIV 107
# define BN_F_BN_DIV_NO_BRANCH 138
# define BN_F_BN_DIV_RECP 130
# define BN_F_BN_EXP 123
# define BN_F_BN_EXPAND2 108
# define BN_F_BN_EXPAND_INTERNAL 120
# define BN_F_BN_GF2M_MOD 131
# define BN_F_BN_GF2M_MOD_EXP 132
# define BN_F_BN_GF2M_MOD_MUL 133
# define BN_F_BN_GF2M_MOD_SOLVE_QUAD 134
# define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 135
# define BN_F_BN_GF2M_MOD_SQR 136
# define BN_F_BN_GF2M_MOD_SQRT 137
# define BN_F_BN_LSHIFT 145
# define BN_F_BN_MOD_EXP2_MONT 118
# define BN_F_BN_MOD_EXP_MONT 109
# define BN_F_BN_MOD_EXP_MONT_CONSTTIME 124
# define BN_F_BN_MOD_EXP_MONT_WORD 117
# define BN_F_BN_MOD_EXP_RECP 125
# define BN_F_BN_MOD_EXP_SIMPLE 126
# define BN_F_BN_MOD_INVERSE 110
# define BN_F_BN_MOD_INVERSE_NO_BRANCH 139
# define BN_F_BN_MOD_LSHIFT_QUICK 119
# define BN_F_BN_MOD_MUL_RECIPROCAL 111
# define BN_F_BN_MOD_SQRT 121
# define BN_F_BN_MPI2BN 112
# define BN_F_BN_NEW 113
# define BN_F_BN_RAND 114
# define BN_F_BN_RAND_RANGE 122
# define BN_F_BN_RSHIFT 146
# define BN_F_BN_USUB 115
/* Reason codes. */
# define BN_R_ARG2_LT_ARG3 100
# define BN_R_BAD_RECIPROCAL 101
# define BN_R_BIGNUM_TOO_LONG 114
# define BN_R_BITS_TOO_SMALL 118
# define BN_R_CALLED_WITH_EVEN_MODULUS 102
# define BN_R_DIV_BY_ZERO 103
# define BN_R_ENCODING_ERROR 104
# define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105
# define BN_R_INPUT_NOT_REDUCED 110
# define BN_R_INVALID_LENGTH 106
# define BN_R_INVALID_RANGE 115
# define BN_R_INVALID_SHIFT 119
# define BN_R_NOT_A_SQUARE 111
# define BN_R_NOT_INITIALIZED 107
# define BN_R_NO_INVERSE 108
# define BN_R_NO_SOLUTION 116
# define BN_R_P_IS_NOT_PRIME 112
# define BN_R_TOO_MANY_ITERATIONS 113
# define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,19 @@
We need
* bn_mul_comba8
* bn_mul_comba4
* bn_mul_normal
* bn_mul_recursive
* bn_sqr_comba8
* bn_sqr_comba4
bn_sqr_normal -> BN_sqr
* bn_sqr_recursive
* bn_mul_low_recursive
* bn_mul_low_normal
* bn_mul_high
* bn_mul_part_recursive # symetric but not power of 2
bn_mul_asymetric_recursive # uneven, but do the chop up.

View File

@@ -0,0 +1,313 @@
/* crypto/bn/bn_add.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
/* r can == a or b */
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
const BIGNUM *tmp;
int a_neg = a->neg, ret;
bn_check_top(a);
bn_check_top(b);
/*-
* a + b a+b
* a + -b a-b
* -a + b b-a
* -a + -b -(a+b)
*/
if (a_neg ^ b->neg) {
/* only one is negative */
if (a_neg) {
tmp = a;
a = b;
b = tmp;
}
/* we are now a - b */
if (BN_ucmp(a, b) < 0) {
if (!BN_usub(r, b, a))
return (0);
r->neg = 1;
} else {
if (!BN_usub(r, a, b))
return (0);
r->neg = 0;
}
return (1);
}
ret = BN_uadd(r, a, b);
r->neg = a_neg;
bn_check_top(r);
return ret;
}
/* unsigned add of b to a */
int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int max, min, dif;
BN_ULONG *ap, *bp, *rp, carry, t1, t2;
const BIGNUM *tmp;
bn_check_top(a);
bn_check_top(b);
if (a->top < b->top) {
tmp = a;
a = b;
b = tmp;
}
max = a->top;
min = b->top;
dif = max - min;
if (bn_wexpand(r, max + 1) == NULL)
return 0;
r->top = max;
ap = a->d;
bp = b->d;
rp = r->d;
carry = bn_add_words(rp, ap, bp, min);
rp += min;
ap += min;
bp += min;
if (carry) {
while (dif) {
dif--;
t1 = *(ap++);
t2 = (t1 + 1) & BN_MASK2;
*(rp++) = t2;
if (t2) {
carry = 0;
break;
}
}
if (carry) {
/* carry != 0 => dif == 0 */
*rp = 1;
r->top++;
}
}
if (dif && rp != ap)
while (dif--)
/* copy remaining words if ap != rp */
*(rp++) = *(ap++);
r->neg = 0;
bn_check_top(r);
return 1;
}
/* unsigned subtraction of b from a, a must be larger than b. */
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int max, min, dif;
register BN_ULONG t1, t2, *ap, *bp, *rp;
int i, carry;
#if defined(IRIX_CC_BUG) && !defined(LINT)
int dummy;
#endif
bn_check_top(a);
bn_check_top(b);
max = a->top;
min = b->top;
dif = max - min;
if (dif < 0) { /* hmm... should not be happening */
BNerr(BN_F_BN_USUB, BN_R_ARG2_LT_ARG3);
return (0);
}
if (bn_wexpand(r, max) == NULL)
return (0);
ap = a->d;
bp = b->d;
rp = r->d;
#if 1
carry = 0;
for (i = min; i != 0; i--) {
t1 = *(ap++);
t2 = *(bp++);
if (carry) {
carry = (t1 <= t2);
t1 = (t1 - t2 - 1) & BN_MASK2;
} else {
carry = (t1 < t2);
t1 = (t1 - t2) & BN_MASK2;
}
# if defined(IRIX_CC_BUG) && !defined(LINT)
dummy = t1;
# endif
*(rp++) = t1 & BN_MASK2;
}
#else
carry = bn_sub_words(rp, ap, bp, min);
ap += min;
bp += min;
rp += min;
#endif
if (carry) { /* subtracted */
if (!dif)
/* error: a < b */
return 0;
while (dif) {
dif--;
t1 = *(ap++);
t2 = (t1 - 1) & BN_MASK2;
*(rp++) = t2;
if (t1)
break;
}
}
#if 0
memcpy(rp, ap, sizeof(*rp) * (max - i));
#else
if (rp != ap) {
for (;;) {
if (!dif--)
break;
rp[0] = ap[0];
if (!dif--)
break;
rp[1] = ap[1];
if (!dif--)
break;
rp[2] = ap[2];
if (!dif--)
break;
rp[3] = ap[3];
rp += 4;
ap += 4;
}
}
#endif
r->top = max;
r->neg = 0;
bn_correct_top(r);
return (1);
}
int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int max;
int add = 0, neg = 0;
const BIGNUM *tmp;
bn_check_top(a);
bn_check_top(b);
/*-
* a - b a-b
* a - -b a+b
* -a - b -(a+b)
* -a - -b b-a
*/
if (a->neg) {
if (b->neg) {
tmp = a;
a = b;
b = tmp;
} else {
add = 1;
neg = 1;
}
} else {
if (b->neg) {
add = 1;
neg = 0;
}
}
if (add) {
if (!BN_uadd(r, a, b))
return (0);
r->neg = neg;
return (1);
}
/* We are actually doing a - b :-) */
max = (a->top > b->top) ? a->top : b->top;
if (bn_wexpand(r, max) == NULL)
return (0);
if (BN_ucmp(a, b) < 0) {
if (!BN_usub(r, b, a))
return (0);
r->neg = 1;
} else {
if (!BN_usub(r, a, b))
return (0);
r->neg = 0;
}
bn_check_top(r);
return (1);
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,385 @@
/* crypto/bn/bn_blind.c */
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#define BN_BLINDING_COUNTER 32
struct bn_blinding_st {
BIGNUM *A;
BIGNUM *Ai;
BIGNUM *e;
BIGNUM *mod; /* just a reference */
#ifndef OPENSSL_NO_DEPRECATED
unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b; used
* only by crypto/rsa/rsa_eay.c, rsa_lib.c */
#endif
CRYPTO_THREADID tid;
int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp) (BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
};
BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
{
BN_BLINDING *ret = NULL;
bn_check_top(mod);
if ((ret = (BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING))) == NULL) {
BNerr(BN_F_BN_BLINDING_NEW, ERR_R_MALLOC_FAILURE);
return (NULL);
}
memset(ret, 0, sizeof(BN_BLINDING));
if (A != NULL) {
if ((ret->A = BN_dup(A)) == NULL)
goto err;
}
if (Ai != NULL) {
if ((ret->Ai = BN_dup(Ai)) == NULL)
goto err;
}
/* save a copy of mod in the BN_BLINDING structure */
if ((ret->mod = BN_dup(mod)) == NULL)
goto err;
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
/*
* Set the counter to the special value -1 to indicate that this is
* never-used fresh blinding that does not need updating before first
* use.
*/
ret->counter = -1;
CRYPTO_THREADID_current(&ret->tid);
return (ret);
err:
if (ret != NULL)
BN_BLINDING_free(ret);
return (NULL);
}
void BN_BLINDING_free(BN_BLINDING *r)
{
if (r == NULL)
return;
if (r->A != NULL)
BN_free(r->A);
if (r->Ai != NULL)
BN_free(r->Ai);
if (r->e != NULL)
BN_free(r->e);
if (r->mod != NULL)
BN_free(r->mod);
OPENSSL_free(r);
}
int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
{
int ret = 0;
if ((b->A == NULL) || (b->Ai == NULL)) {
BNerr(BN_F_BN_BLINDING_UPDATE, BN_R_NOT_INITIALIZED);
goto err;
}
if (b->counter == -1)
b->counter = 0;
if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE)) {
/* re-create blinding parameters */
if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
goto err;
} else if (!(b->flags & BN_BLINDING_NO_UPDATE)) {
if (!BN_mod_mul(b->A, b->A, b->A, b->mod, ctx))
goto err;
if (!BN_mod_mul(b->Ai, b->Ai, b->Ai, b->mod, ctx))
goto err;
}
ret = 1;
err:
if (b->counter == BN_BLINDING_COUNTER)
b->counter = 0;
return (ret);
}
int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
return BN_BLINDING_convert_ex(n, NULL, b, ctx);
}
int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
{
int ret = 1;
bn_check_top(n);
if ((b->A == NULL) || (b->Ai == NULL)) {
BNerr(BN_F_BN_BLINDING_CONVERT_EX, BN_R_NOT_INITIALIZED);
return (0);
}
if (b->counter == -1)
/* Fresh blinding, doesn't need updating. */
b->counter = 0;
else if (!BN_BLINDING_update(b, ctx))
return (0);
if (r != NULL) {
if (!BN_copy(r, b->Ai))
ret = 0;
}
if (!BN_mod_mul(n, n, b->A, b->mod, ctx))
ret = 0;
return ret;
}
int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
return BN_BLINDING_invert_ex(n, NULL, b, ctx);
}
int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b,
BN_CTX *ctx)
{
int ret;
bn_check_top(n);
if (r != NULL)
ret = BN_mod_mul(n, n, r, b->mod, ctx);
else {
if (b->Ai == NULL) {
BNerr(BN_F_BN_BLINDING_INVERT_EX, BN_R_NOT_INITIALIZED);
return (0);
}
ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
}
bn_check_top(n);
return (ret);
}
#ifndef OPENSSL_NO_DEPRECATED
unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *b)
{
return b->thread_id;
}
void BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
{
b->thread_id = n;
}
#endif
CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *b)
{
return &b->tid;
}
unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
{
return b->flags;
}
void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
{
b->flags = flags;
}
BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
int (*bn_mod_exp) (BIGNUM *r,
const BIGNUM *a,
const BIGNUM *p,
const BIGNUM *m,
BN_CTX *ctx,
BN_MONT_CTX *m_ctx),
BN_MONT_CTX *m_ctx)
{
int retry_counter = 32;
BN_BLINDING *ret = NULL;
if (b == NULL)
ret = BN_BLINDING_new(NULL, NULL, m);
else
ret = b;
if (ret == NULL)
goto err;
if (ret->A == NULL && (ret->A = BN_new()) == NULL)
goto err;
if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
goto err;
if (e != NULL) {
if (ret->e != NULL)
BN_free(ret->e);
ret->e = BN_dup(e);
}
if (ret->e == NULL)
goto err;
if (bn_mod_exp != NULL)
ret->bn_mod_exp = bn_mod_exp;
if (m_ctx != NULL)
ret->m_ctx = m_ctx;
do {
if (!BN_rand_range(ret->A, ret->mod))
goto err;
if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL) {
/*
* this should almost never happen for good RSA keys
*/
unsigned long error = ERR_peek_last_error();
if (ERR_GET_REASON(error) == BN_R_NO_INVERSE) {
if (retry_counter-- == 0) {
BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
BN_R_TOO_MANY_ITERATIONS);
goto err;
}
ERR_clear_error();
} else
goto err;
} else
break;
} while (1);
if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL) {
if (!ret->bn_mod_exp
(ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
goto err;
} else {
if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
goto err;
}
return ret;
err:
if (b == NULL && ret != NULL) {
BN_BLINDING_free(ret);
ret = NULL;
}
return ret;
}

Binary file not shown.

View File

@@ -0,0 +1,547 @@
/* crypto/bn/knownprimes.c */
/* Insert boilerplate */
#include "bn.h"
/*-
* "First Oakley Default Group" from RFC2409, section 6.1.
*
* The prime is: 2^768 - 2 ^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
*
* RFC2409 specifies a generator of 2.
* RFC2412 specifies a generator of of 22.
*/
BIGNUM *get_rfc2409_prime_768(BIGNUM *bn)
{
static const unsigned char RFC2409_PRIME_768[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x3A, 0x36, 0x20,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC2409_PRIME_768, sizeof(RFC2409_PRIME_768), bn);
}
/*-
* "Second Oakley Default Group" from RFC2409, section 6.2.
*
* The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
*
* RFC2409 specifies a generator of 2.
* RFC2412 specifies a generator of 22.
*/
BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn)
{
static const unsigned char RFC2409_PRIME_1024[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE6, 0x53, 0x81,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC2409_PRIME_1024, sizeof(RFC2409_PRIME_1024), bn);
}
/*-
* "1536-bit MODP Group" from RFC3526, Section 2.
*
* The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
*
* RFC3526 specifies a generator of 2.
* RFC2312 specifies a generator of 22.
*/
BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_1536[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x23, 0x73, 0x27,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_1536, sizeof(RFC3526_PRIME_1536), bn);
}
/*-
* "2048-bit MODP Group" from RFC3526, Section 3.
*
* The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_2048[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAC, 0xAA, 0x68,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_2048, sizeof(RFC3526_PRIME_2048), bn);
}
/*-
* "3072-bit MODP Group" from RFC3526, Section 4.
*
* The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_3072[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x3A, 0xD2, 0xCA,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_3072, sizeof(RFC3526_PRIME_3072), bn);
}
/*-
* "4096-bit MODP Group" from RFC3526, Section 5.
*
* The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_4096[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x06, 0x31, 0x99,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_4096, sizeof(RFC3526_PRIME_4096), bn);
}
/*-
* "6144-bit MODP Group" from RFC3526, Section 6.
*
* The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_6144[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xCC, 0x40, 0x24,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_6144, sizeof(RFC3526_PRIME_6144), bn);
}
/*-
* "8192-bit MODP Group" from RFC3526, Section 7.
*
* The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_8192[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xBE, 0x11, 0x59,
0x74, 0xA3, 0x92, 0x6F, 0x12, 0xFE, 0xE5, 0xE4,
0x38, 0x77, 0x7C, 0xB6, 0xA9, 0x32, 0xDF, 0x8C,
0xD8, 0xBE, 0xC4, 0xD0, 0x73, 0xB9, 0x31, 0xBA,
0x3B, 0xC8, 0x32, 0xB6, 0x8D, 0x9D, 0xD3, 0x00,
0x74, 0x1F, 0xA7, 0xBF, 0x8A, 0xFC, 0x47, 0xED,
0x25, 0x76, 0xF6, 0x93, 0x6B, 0xA4, 0x24, 0x66,
0x3A, 0xAB, 0x63, 0x9C, 0x5A, 0xE4, 0xF5, 0x68,
0x34, 0x23, 0xB4, 0x74, 0x2B, 0xF1, 0xC9, 0x78,
0x23, 0x8F, 0x16, 0xCB, 0xE3, 0x9D, 0x65, 0x2D,
0xE3, 0xFD, 0xB8, 0xBE, 0xFC, 0x84, 0x8A, 0xD9,
0x22, 0x22, 0x2E, 0x04, 0xA4, 0x03, 0x7C, 0x07,
0x13, 0xEB, 0x57, 0xA8, 0x1A, 0x23, 0xF0, 0xC7,
0x34, 0x73, 0xFC, 0x64, 0x6C, 0xEA, 0x30, 0x6B,
0x4B, 0xCB, 0xC8, 0x86, 0x2F, 0x83, 0x85, 0xDD,
0xFA, 0x9D, 0x4B, 0x7F, 0xA2, 0xC0, 0x87, 0xE8,
0x79, 0x68, 0x33, 0x03, 0xED, 0x5B, 0xDD, 0x3A,
0x06, 0x2B, 0x3C, 0xF5, 0xB3, 0xA2, 0x78, 0xA6,
0x6D, 0x2A, 0x13, 0xF8, 0x3F, 0x44, 0xF8, 0x2D,
0xDF, 0x31, 0x0E, 0xE0, 0x74, 0xAB, 0x6A, 0x36,
0x45, 0x97, 0xE8, 0x99, 0xA0, 0x25, 0x5D, 0xC1,
0x64, 0xF3, 0x1C, 0xC5, 0x08, 0x46, 0x85, 0x1D,
0xF9, 0xAB, 0x48, 0x19, 0x5D, 0xED, 0x7E, 0xA1,
0xB1, 0xD5, 0x10, 0xBD, 0x7E, 0xE7, 0x4D, 0x73,
0xFA, 0xF3, 0x6B, 0xC3, 0x1E, 0xCF, 0xA2, 0x68,
0x35, 0x90, 0x46, 0xF4, 0xEB, 0x87, 0x9F, 0x92,
0x40, 0x09, 0x43, 0x8B, 0x48, 0x1C, 0x6C, 0xD7,
0x88, 0x9A, 0x00, 0x2E, 0xD5, 0xEE, 0x38, 0x2B,
0xC9, 0x19, 0x0D, 0xA6, 0xFC, 0x02, 0x6E, 0x47,
0x95, 0x58, 0xE4, 0x47, 0x56, 0x77, 0xE9, 0xAA,
0x9E, 0x30, 0x50, 0xE2, 0x76, 0x56, 0x94, 0xDF,
0xC8, 0x1F, 0x56, 0xE8, 0x80, 0xB9, 0x6E, 0x71,
0x60, 0xC9, 0x80, 0xDD, 0x98, 0xED, 0xD3, 0xDF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_8192, sizeof(RFC3526_PRIME_8192), bn);
}

Binary file not shown.

View File

@@ -0,0 +1,448 @@
/* crypto/bn/bn_ctx.c */
/* Written by Ulf Moeller for the OpenSSL project. */
/* ====================================================================
* Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#if !defined(BN_CTX_DEBUG) && !defined(BN_DEBUG)
# ifndef NDEBUG
# define NDEBUG
# endif
#endif
#include <stdio.h>
#include <assert.h>
#include "cryptlib.h"
#include "bn_lcl.h"
/*-
* TODO list
*
* 1. Check a bunch of "(words+1)" type hacks in various bignum functions and
* check they can be safely removed.
* - Check +1 and other ugliness in BN_from_montgomery()
*
* 2. Consider allowing a BN_new_ex() that, at least, lets you specify an
* appropriate 'block' size that will be honoured by bn_expand_internal() to
* prevent piddly little reallocations. OTOH, profiling bignum expansions in
* BN_CTX doesn't show this to be a big issue.
*/
/* How many bignums are in each "pool item"; */
#define BN_CTX_POOL_SIZE 16
/* The stack frame info is resizing, set a first-time expansion size; */
#define BN_CTX_START_FRAMES 32
/***********/
/* BN_POOL */
/***********/
/* A bundle of bignums that can be linked with other bundles */
typedef struct bignum_pool_item {
/* The bignum values */
BIGNUM vals[BN_CTX_POOL_SIZE];
/* Linked-list admin */
struct bignum_pool_item *prev, *next;
} BN_POOL_ITEM;
/* A linked-list of bignums grouped in bundles */
typedef struct bignum_pool {
/* Linked-list admin */
BN_POOL_ITEM *head, *current, *tail;
/* Stack depth and allocation size */
unsigned used, size;
} BN_POOL;
static void BN_POOL_init(BN_POOL *);
static void BN_POOL_finish(BN_POOL *);
#ifndef OPENSSL_NO_DEPRECATED
static void BN_POOL_reset(BN_POOL *);
#endif
static BIGNUM *BN_POOL_get(BN_POOL *);
static void BN_POOL_release(BN_POOL *, unsigned int);
/************/
/* BN_STACK */
/************/
/* A wrapper to manage the "stack frames" */
typedef struct bignum_ctx_stack {
/* Array of indexes into the bignum stack */
unsigned int *indexes;
/* Number of stack frames, and the size of the allocated array */
unsigned int depth, size;
} BN_STACK;
static void BN_STACK_init(BN_STACK *);
static void BN_STACK_finish(BN_STACK *);
#ifndef OPENSSL_NO_DEPRECATED
static void BN_STACK_reset(BN_STACK *);
#endif
static int BN_STACK_push(BN_STACK *, unsigned int);
static unsigned int BN_STACK_pop(BN_STACK *);
/**********/
/* BN_CTX */
/**********/
/* The opaque BN_CTX type */
struct bignum_ctx {
/* The bignum bundles */
BN_POOL pool;
/* The "stack frames", if you will */
BN_STACK stack;
/* The number of bignums currently assigned */
unsigned int used;
/* Depth of stack overflow */
int err_stack;
/* Block "gets" until an "end" (compatibility behaviour) */
int too_many;
};
/* Enable this to find BN_CTX bugs */
#ifdef BN_CTX_DEBUG
static const char *ctxdbg_cur = NULL;
static void ctxdbg(BN_CTX *ctx)
{
unsigned int bnidx = 0, fpidx = 0;
BN_POOL_ITEM *item = ctx->pool.head;
BN_STACK *stack = &ctx->stack;
fprintf(stderr, "(%16p): ", ctx);
while (bnidx < ctx->used) {
fprintf(stderr, "%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
if (!(bnidx % BN_CTX_POOL_SIZE))
item = item->next;
}
fprintf(stderr, "\n");
bnidx = 0;
fprintf(stderr, " : ");
while (fpidx < stack->depth) {
while (bnidx++ < stack->indexes[fpidx])
fprintf(stderr, " ");
fprintf(stderr, "^^^ ");
bnidx++;
fpidx++;
}
fprintf(stderr, "\n");
}
# define CTXDBG_ENTRY(str, ctx) do { \
ctxdbg_cur = (str); \
fprintf(stderr,"Starting %s\n", ctxdbg_cur); \
ctxdbg(ctx); \
} while(0)
# define CTXDBG_EXIT(ctx) do { \
fprintf(stderr,"Ending %s\n", ctxdbg_cur); \
ctxdbg(ctx); \
} while(0)
# define CTXDBG_RET(ctx,ret)
#else
# define CTXDBG_ENTRY(str, ctx)
# define CTXDBG_EXIT(ctx)
# define CTXDBG_RET(ctx,ret)
#endif
/*
* This function is an evil legacy and should not be used. This
* implementation is WYSIWYG, though I've done my best.
*/
#ifndef OPENSSL_NO_DEPRECATED
void BN_CTX_init(BN_CTX *ctx)
{
/*
* Assume the caller obtained the context via BN_CTX_new() and so is
* trying to reset it for use. Nothing else makes sense, least of all
* binary compatibility from a time when they could declare a static
* variable.
*/
BN_POOL_reset(&ctx->pool);
BN_STACK_reset(&ctx->stack);
ctx->used = 0;
ctx->err_stack = 0;
ctx->too_many = 0;
}
#endif
BN_CTX *BN_CTX_new(void)
{
BN_CTX *ret = OPENSSL_malloc(sizeof(BN_CTX));
if (!ret) {
BNerr(BN_F_BN_CTX_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
/* Initialise the structure */
BN_POOL_init(&ret->pool);
BN_STACK_init(&ret->stack);
ret->used = 0;
ret->err_stack = 0;
ret->too_many = 0;
return ret;
}
void BN_CTX_free(BN_CTX *ctx)
{
if (ctx == NULL)
return;
#ifdef BN_CTX_DEBUG
{
BN_POOL_ITEM *pool = ctx->pool.head;
fprintf(stderr, "BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
ctx->stack.size, ctx->pool.size);
fprintf(stderr, "dmaxs: ");
while (pool) {
unsigned loop = 0;
while (loop < BN_CTX_POOL_SIZE)
fprintf(stderr, "%02x ", pool->vals[loop++].dmax);
pool = pool->next;
}
fprintf(stderr, "\n");
}
#endif
BN_STACK_finish(&ctx->stack);
BN_POOL_finish(&ctx->pool);
OPENSSL_free(ctx);
}
void BN_CTX_start(BN_CTX *ctx)
{
CTXDBG_ENTRY("BN_CTX_start", ctx);
/* If we're already overflowing ... */
if (ctx->err_stack || ctx->too_many)
ctx->err_stack++;
/* (Try to) get a new frame pointer */
else if (!BN_STACK_push(&ctx->stack, ctx->used)) {
BNerr(BN_F_BN_CTX_START, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
ctx->err_stack++;
}
CTXDBG_EXIT(ctx);
}
void BN_CTX_end(BN_CTX *ctx)
{
CTXDBG_ENTRY("BN_CTX_end", ctx);
if (ctx->err_stack)
ctx->err_stack--;
else {
unsigned int fp = BN_STACK_pop(&ctx->stack);
/* Does this stack frame have anything to release? */
if (fp < ctx->used)
BN_POOL_release(&ctx->pool, ctx->used - fp);
ctx->used = fp;
/* Unjam "too_many" in case "get" had failed */
ctx->too_many = 0;
}
CTXDBG_EXIT(ctx);
}
BIGNUM *BN_CTX_get(BN_CTX *ctx)
{
BIGNUM *ret;
CTXDBG_ENTRY("BN_CTX_get", ctx);
if (ctx->err_stack || ctx->too_many)
return NULL;
if ((ret = BN_POOL_get(&ctx->pool)) == NULL) {
/*
* Setting too_many prevents repeated "get" attempts from cluttering
* the error stack.
*/
ctx->too_many = 1;
BNerr(BN_F_BN_CTX_GET, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
return NULL;
}
/* OK, make sure the returned bignum is "zero" */
BN_zero(ret);
ctx->used++;
CTXDBG_RET(ctx, ret);
return ret;
}
/************/
/* BN_STACK */
/************/
static void BN_STACK_init(BN_STACK *st)
{
st->indexes = NULL;
st->depth = st->size = 0;
}
static void BN_STACK_finish(BN_STACK *st)
{
if (st->size)
OPENSSL_free(st->indexes);
}
#ifndef OPENSSL_NO_DEPRECATED
static void BN_STACK_reset(BN_STACK *st)
{
st->depth = 0;
}
#endif
static int BN_STACK_push(BN_STACK *st, unsigned int idx)
{
if (st->depth == st->size)
/* Need to expand */
{
unsigned int newsize = (st->size ?
(st->size * 3 / 2) : BN_CTX_START_FRAMES);
unsigned int *newitems = OPENSSL_malloc(newsize *
sizeof(unsigned int));
if (!newitems)
return 0;
if (st->depth)
memcpy(newitems, st->indexes, st->depth * sizeof(unsigned int));
if (st->size)
OPENSSL_free(st->indexes);
st->indexes = newitems;
st->size = newsize;
}
st->indexes[(st->depth)++] = idx;
return 1;
}
static unsigned int BN_STACK_pop(BN_STACK *st)
{
return st->indexes[--(st->depth)];
}
/***********/
/* BN_POOL */
/***********/
static void BN_POOL_init(BN_POOL *p)
{
p->head = p->current = p->tail = NULL;
p->used = p->size = 0;
}
static void BN_POOL_finish(BN_POOL *p)
{
while (p->head) {
unsigned int loop = 0;
BIGNUM *bn = p->head->vals;
while (loop++ < BN_CTX_POOL_SIZE) {
if (bn->d)
BN_clear_free(bn);
bn++;
}
p->current = p->head->next;
OPENSSL_free(p->head);
p->head = p->current;
}
}
#ifndef OPENSSL_NO_DEPRECATED
static void BN_POOL_reset(BN_POOL *p)
{
BN_POOL_ITEM *item = p->head;
while (item) {
unsigned int loop = 0;
BIGNUM *bn = item->vals;
while (loop++ < BN_CTX_POOL_SIZE) {
if (bn->d)
BN_clear(bn);
bn++;
}
item = item->next;
}
p->current = p->head;
p->used = 0;
}
#endif
static BIGNUM *BN_POOL_get(BN_POOL *p)
{
if (p->used == p->size) {
BIGNUM *bn;
unsigned int loop = 0;
BN_POOL_ITEM *item = OPENSSL_malloc(sizeof(BN_POOL_ITEM));
if (!item)
return NULL;
/* Initialise the structure */
bn = item->vals;
while (loop++ < BN_CTX_POOL_SIZE)
BN_init(bn++);
item->prev = p->tail;
item->next = NULL;
/* Link it in */
if (!p->head)
p->head = p->current = p->tail = item;
else {
p->tail->next = item;
p->tail = item;
p->current = item;
}
p->size += BN_CTX_POOL_SIZE;
p->used++;
/* Return the first bignum from the new pool */
return item->vals;
}
if (!p->used)
p->current = p->head;
else if ((p->used % BN_CTX_POOL_SIZE) == 0)
p->current = p->current->next;
return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
}
static void BN_POOL_release(BN_POOL *p, unsigned int num)
{
unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
p->used -= num;
while (num--) {
bn_check_top(p->current->vals + offset);
if (!offset) {
offset = BN_CTX_POOL_SIZE - 1;
p->current = p->current->prev;
} else
offset--;
}
}

Binary file not shown.

View File

@@ -0,0 +1,115 @@
/* crypto/bn/bn_depr.c */
/* ====================================================================
* Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/*
* Support for deprecated functions goes here - static linkage will only
* slurp this code if applications are using them directly.
*/
#include <stdio.h>
#include <time.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
static void *dummy = &dummy;
#ifndef OPENSSL_NO_DEPRECATED
BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
const BIGNUM *add, const BIGNUM *rem,
void (*callback) (int, int, void *), void *cb_arg)
{
BN_GENCB cb;
BIGNUM *rnd = NULL;
int found = 0;
BN_GENCB_set_old(&cb, callback, cb_arg);
if (ret == NULL) {
if ((rnd = BN_new()) == NULL)
goto err;
} else
rnd = ret;
if (!BN_generate_prime_ex(rnd, bits, safe, add, rem, &cb))
goto err;
/* we have a prime :-) */
found = 1;
err:
if (!found && (ret == NULL) && (rnd != NULL))
BN_free(rnd);
return (found ? rnd : NULL);
}
int BN_is_prime(const BIGNUM *a, int checks,
void (*callback) (int, int, void *), BN_CTX *ctx_passed,
void *cb_arg)
{
BN_GENCB cb;
BN_GENCB_set_old(&cb, callback, cb_arg);
return BN_is_prime_ex(a, checks, ctx_passed, &cb);
}
int BN_is_prime_fasttest(const BIGNUM *a, int checks,
void (*callback) (int, int, void *),
BN_CTX *ctx_passed, void *cb_arg,
int do_trial_division)
{
BN_GENCB cb;
BN_GENCB_set_old(&cb, callback, cb_arg);
return BN_is_prime_fasttest_ex(a, checks, ctx_passed,
do_trial_division, &cb);
}
#endif

Binary file not shown.

View File

@@ -0,0 +1,477 @@
/* crypto/bn/bn_div.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <openssl/bn.h>
#include "cryptlib.h"
#include "bn_lcl.h"
/* The old slow way */
#if 0
int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
BN_CTX *ctx)
{
int i, nm, nd;
int ret = 0;
BIGNUM *D;
bn_check_top(m);
bn_check_top(d);
if (BN_is_zero(d)) {
BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
return (0);
}
if (BN_ucmp(m, d) < 0) {
if (rem != NULL) {
if (BN_copy(rem, m) == NULL)
return (0);
}
if (dv != NULL)
BN_zero(dv);
return (1);
}
BN_CTX_start(ctx);
D = BN_CTX_get(ctx);
if (dv == NULL)
dv = BN_CTX_get(ctx);
if (rem == NULL)
rem = BN_CTX_get(ctx);
if (D == NULL || dv == NULL || rem == NULL)
goto end;
nd = BN_num_bits(d);
nm = BN_num_bits(m);
if (BN_copy(D, d) == NULL)
goto end;
if (BN_copy(rem, m) == NULL)
goto end;
/*
* The next 2 are needed so we can do a dv->d[0]|=1 later since
* BN_lshift1 will only work once there is a value :-)
*/
BN_zero(dv);
if (bn_wexpand(dv, 1) == NULL)
goto end;
dv->top = 1;
if (!BN_lshift(D, D, nm - nd))
goto end;
for (i = nm - nd; i >= 0; i--) {
if (!BN_lshift1(dv, dv))
goto end;
if (BN_ucmp(rem, D) >= 0) {
dv->d[0] |= 1;
if (!BN_usub(rem, rem, D))
goto end;
}
/* CAN IMPROVE (and have now :=) */
if (!BN_rshift1(D, D))
goto end;
}
rem->neg = BN_is_zero(rem) ? 0 : m->neg;
dv->neg = m->neg ^ d->neg;
ret = 1;
end:
BN_CTX_end(ctx);
return (ret);
}
#else
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
&& !defined(PEDANTIC) && !defined(BN_DIV3W)
# if defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined (__i386__)
/*-
* There were two reasons for implementing this template:
* - GNU C generates a call to a function (__udivdi3 to be exact)
* in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
* understand why...);
* - divl doesn't only calculate quotient, but also leaves
* remainder in %edx which we can definitely use here:-)
*
* <appro@fy.chalmers.se>
*/
# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
: "=a"(q), "=d"(rem) \
: "a"(n1), "d"(n0), "g"(d0) \
: "cc"); \
q; \
})
# define REMAINDER_IS_ALREADY_CALCULATED
# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
/*
* Same story here, but it's 128-bit by 64-bit division. Wow!
* <appro@fy.chalmers.se>
*/
# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
: "=a"(q), "=d"(rem) \
: "a"(n1), "d"(n0), "g"(d0) \
: "cc"); \
q; \
})
# define REMAINDER_IS_ALREADY_CALCULATED
# endif /* __<cpu> */
# endif /* __GNUC__ */
# endif /* OPENSSL_NO_ASM */
/*-
* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
int norm_shift, i, loop;
BIGNUM *tmp, wnum, *snum, *sdiv, *res;
BN_ULONG *resp, *wnump;
BN_ULONG d0, d1;
int num_n, div_n;
int no_branch = 0;
/*
* Invalid zero-padding would have particularly bad consequences so don't
* just rely on bn_check_top() here (bn_check_top() works only for
* BN_DEBUG builds)
*/
if ((num->top > 0 && num->d[num->top - 1] == 0) ||
(divisor->top > 0 && divisor->d[divisor->top - 1] == 0)) {
BNerr(BN_F_BN_DIV, BN_R_NOT_INITIALIZED);
return 0;
}
bn_check_top(num);
bn_check_top(divisor);
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0)
|| (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) {
no_branch = 1;
}
bn_check_top(dv);
bn_check_top(rm);
/*- bn_check_top(num); *//*
* 'num' has been checked already
*/
/*- bn_check_top(divisor); *//*
* 'divisor' has been checked already
*/
if (BN_is_zero(divisor)) {
BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
return (0);
}
if (!no_branch && BN_ucmp(num, divisor) < 0) {
if (rm != NULL) {
if (BN_copy(rm, num) == NULL)
return (0);
}
if (dv != NULL)
BN_zero(dv);
return (1);
}
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
snum = BN_CTX_get(ctx);
sdiv = BN_CTX_get(ctx);
if (dv == NULL)
res = BN_CTX_get(ctx);
else
res = dv;
if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL)
goto err;
/* First we normalise the numbers */
norm_shift = BN_BITS2 - ((BN_num_bits(divisor)) % BN_BITS2);
if (!(BN_lshift(sdiv, divisor, norm_shift)))
goto err;
sdiv->neg = 0;
norm_shift += BN_BITS2;
if (!(BN_lshift(snum, num, norm_shift)))
goto err;
snum->neg = 0;
if (no_branch) {
/*
* Since we don't know whether snum is larger than sdiv, we pad snum
* with enough zeroes without changing its value.
*/
if (snum->top <= sdiv->top + 1) {
if (bn_wexpand(snum, sdiv->top + 2) == NULL)
goto err;
for (i = snum->top; i < sdiv->top + 2; i++)
snum->d[i] = 0;
snum->top = sdiv->top + 2;
} else {
if (bn_wexpand(snum, snum->top + 1) == NULL)
goto err;
snum->d[snum->top] = 0;
snum->top++;
}
}
div_n = sdiv->top;
num_n = snum->top;
loop = num_n - div_n;
/*
* Lets setup a 'window' into snum This is the part that corresponds to
* the current 'area' being divided
*/
wnum.neg = 0;
wnum.d = &(snum->d[loop]);
wnum.top = div_n;
/*
* only needed when BN_ucmp messes up the values between top and max
*/
wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
/* Get the top 2 words of sdiv */
/* div_n=sdiv->top; */
d0 = sdiv->d[div_n - 1];
d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
/* pointer to the 'top' of snum */
wnump = &(snum->d[num_n - 1]);
/* Setup to 'res' */
res->neg = (num->neg ^ divisor->neg);
if (!bn_wexpand(res, (loop + 1)))
goto err;
res->top = loop - no_branch;
resp = &(res->d[loop - 1]);
/* space for temp */
if (!bn_wexpand(tmp, (div_n + 1)))
goto err;
if (!no_branch) {
if (BN_ucmp(&wnum, sdiv) >= 0) {
/*
* If BN_DEBUG_RAND is defined BN_ucmp changes (via bn_pollute)
* the const bignum arguments => clean the values between top and
* max again
*/
bn_clear_top2max(&wnum);
bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
*resp = 1;
} else
res->top--;
}
/*
* if res->top == 0 then clear the neg value otherwise decrease the resp
* pointer
*/
if (res->top == 0)
res->neg = 0;
else
resp--;
for (i = 0; i < loop - 1; i++, wnump--, resp--) {
BN_ULONG q, l0;
/*
* the first part of the loop uses the top two words of snum and sdiv
* to calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv
*/
# if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
BN_ULONG bn_div_3_words(BN_ULONG *, BN_ULONG, BN_ULONG);
q = bn_div_3_words(wnump, d1, d0);
# else
BN_ULONG n0, n1, rem = 0;
n0 = wnump[0];
n1 = wnump[-1];
if (n0 == d0)
q = BN_MASK2;
else { /* n0 < d0 */
# ifdef BN_LLONG
BN_ULLONG t2;
# if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
q = (BN_ULONG)(((((BN_ULLONG) n0) << BN_BITS2) | n1) / d0);
# else
q = bn_div_words(n0, n1, d0);
# ifdef BN_DEBUG_LEVITTE
fprintf(stderr, "DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
X) -> 0x%08X\n", n0, n1, d0, q);
# endif
# endif
# ifndef REMAINDER_IS_ALREADY_CALCULATED
/*
* rem doesn't have to be BN_ULLONG. The least we
* know it's less that d0, isn't it?
*/
rem = (n1 - q * d0) & BN_MASK2;
# endif
t2 = (BN_ULLONG) d1 *q;
for (;;) {
if (t2 <= ((((BN_ULLONG) rem) << BN_BITS2) | wnump[-2]))
break;
q--;
rem += d0;
if (rem < d0)
break; /* don't let rem overflow */
t2 -= d1;
}
# else /* !BN_LLONG */
BN_ULONG t2l, t2h;
q = bn_div_words(n0, n1, d0);
# ifdef BN_DEBUG_LEVITTE
fprintf(stderr, "DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
X) -> 0x%08X\n", n0, n1, d0, q);
# endif
# ifndef REMAINDER_IS_ALREADY_CALCULATED
rem = (n1 - q * d0) & BN_MASK2;
# endif
# if defined(BN_UMULT_LOHI)
BN_UMULT_LOHI(t2l, t2h, d1, q);
# elif defined(BN_UMULT_HIGH)
t2l = d1 * q;
t2h = BN_UMULT_HIGH(d1, q);
# else
{
BN_ULONG ql, qh;
t2l = LBITS(d1);
t2h = HBITS(d1);
ql = LBITS(q);
qh = HBITS(q);
mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
}
# endif
for (;;) {
if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2])))
break;
q--;
rem += d0;
if (rem < d0)
break; /* don't let rem overflow */
if (t2l < d1)
t2h--;
t2l -= d1;
}
# endif /* !BN_LLONG */
}
# endif /* !BN_DIV3W */
l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
tmp->d[div_n] = l0;
wnum.d--;
/*
* ingore top values of the bignums just sub the two BN_ULONG arrays
* with bn_sub_words
*/
if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
/*
* Note: As we have considered only the leading two BN_ULONGs in
* the calculation of q, sdiv * q might be greater than wnum (but
* then (q-1) * sdiv is less or equal than wnum)
*/
q--;
if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
/*
* we can't have an overflow here (assuming that q != 0, but
* if q == 0 then tmp is zero anyway)
*/
(*wnump)++;
}
/* store part of the result */
*resp = q;
}
bn_correct_top(snum);
if (rm != NULL) {
/*
* Keep a copy of the neg flag in num because if rm==num BN_rshift()
* will overwrite it.
*/
int neg = num->neg;
BN_rshift(rm, snum, norm_shift);
if (!BN_is_zero(rm))
rm->neg = neg;
bn_check_top(rm);
}
if (no_branch)
bn_correct_top(res);
BN_CTX_end(ctx);
return (1);
err:
bn_check_top(rm);
BN_CTX_end(ctx);
return (0);
}
#endif

Binary file not shown.

View File

@@ -0,0 +1,154 @@
/* crypto/bn/bn_err.c */
/* ====================================================================
* Copyright (c) 1999-2015 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@OpenSSL.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/*
* NOTE: this file was auto generated by the mkerr.pl script: any changes
* made to it will be overwritten when the script next updates this file,
* only reason strings will be preserved.
*/
#include <stdio.h>
#include <openssl/err.h>
#include <openssl/bn.h>
/* BEGIN ERROR CODES */
#ifndef OPENSSL_NO_ERR
# define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
# define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)
static ERR_STRING_DATA BN_str_functs[] = {
{ERR_FUNC(BN_F_BNRAND), "BNRAND"},
{ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX), "BN_BLINDING_convert_ex"},
{ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM), "BN_BLINDING_create_param"},
{ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX), "BN_BLINDING_invert_ex"},
{ERR_FUNC(BN_F_BN_BLINDING_NEW), "BN_BLINDING_new"},
{ERR_FUNC(BN_F_BN_BLINDING_UPDATE), "BN_BLINDING_update"},
{ERR_FUNC(BN_F_BN_BN2DEC), "BN_bn2dec"},
{ERR_FUNC(BN_F_BN_BN2HEX), "BN_bn2hex"},
{ERR_FUNC(BN_F_BN_CTX_GET), "BN_CTX_get"},
{ERR_FUNC(BN_F_BN_CTX_NEW), "BN_CTX_new"},
{ERR_FUNC(BN_F_BN_CTX_START), "BN_CTX_start"},
{ERR_FUNC(BN_F_BN_DIV), "BN_div"},
{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH), "BN_div_no_branch"},
{ERR_FUNC(BN_F_BN_DIV_RECP), "BN_div_recp"},
{ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
{ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
{ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD), "BN_GF2m_mod_solve_quad"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR), "BN_GF2m_mod_solve_quad_arr"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SQR), "BN_GF2m_mod_sqr"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT), "BN_GF2m_mod_sqrt"},
{ERR_FUNC(BN_F_BN_LSHIFT), "BN_lshift"},
{ERR_FUNC(BN_F_BN_MOD_EXP2_MONT), "BN_mod_exp2_mont"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT), "BN_mod_exp_mont"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME), "BN_mod_exp_mont_consttime"},
{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD), "BN_mod_exp_mont_word"},
{ERR_FUNC(BN_F_BN_MOD_EXP_RECP), "BN_mod_exp_recp"},
{ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE), "BN_mod_exp_simple"},
{ERR_FUNC(BN_F_BN_MOD_INVERSE), "BN_mod_inverse"},
{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH), "BN_mod_inverse_no_branch"},
{ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK), "BN_mod_lshift_quick"},
{ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL), "BN_mod_mul_reciprocal"},
{ERR_FUNC(BN_F_BN_MOD_SQRT), "BN_mod_sqrt"},
{ERR_FUNC(BN_F_BN_MPI2BN), "BN_mpi2bn"},
{ERR_FUNC(BN_F_BN_NEW), "BN_new"},
{ERR_FUNC(BN_F_BN_RAND), "BN_rand"},
{ERR_FUNC(BN_F_BN_RAND_RANGE), "BN_rand_range"},
{ERR_FUNC(BN_F_BN_RSHIFT), "BN_rshift"},
{ERR_FUNC(BN_F_BN_USUB), "BN_usub"},
{0, NULL}
};
static ERR_STRING_DATA BN_str_reasons[] = {
{ERR_REASON(BN_R_ARG2_LT_ARG3), "arg2 lt arg3"},
{ERR_REASON(BN_R_BAD_RECIPROCAL), "bad reciprocal"},
{ERR_REASON(BN_R_BIGNUM_TOO_LONG), "bignum too long"},
{ERR_REASON(BN_R_BITS_TOO_SMALL), "bits too small"},
{ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS), "called with even modulus"},
{ERR_REASON(BN_R_DIV_BY_ZERO), "div by zero"},
{ERR_REASON(BN_R_ENCODING_ERROR), "encoding error"},
{ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),
"expand on static bignum data"},
{ERR_REASON(BN_R_INPUT_NOT_REDUCED), "input not reduced"},
{ERR_REASON(BN_R_INVALID_LENGTH), "invalid length"},
{ERR_REASON(BN_R_INVALID_RANGE), "invalid range"},
{ERR_REASON(BN_R_INVALID_SHIFT), "invalid shift"},
{ERR_REASON(BN_R_NOT_A_SQUARE), "not a square"},
{ERR_REASON(BN_R_NOT_INITIALIZED), "not initialized"},
{ERR_REASON(BN_R_NO_INVERSE), "no inverse"},
{ERR_REASON(BN_R_NO_SOLUTION), "no solution"},
{ERR_REASON(BN_R_P_IS_NOT_PRIME), "p is not prime"},
{ERR_REASON(BN_R_TOO_MANY_ITERATIONS), "too many iterations"},
{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),
"too many temporary variables"},
{0, NULL}
};
#endif
void ERR_load_BN_strings(void)
{
#ifndef OPENSSL_NO_ERR
if (ERR_func_error_string(BN_str_functs[0].error) == NULL) {
ERR_load_strings(0, BN_str_functs);
ERR_load_strings(0, BN_str_reasons);
}
#endif
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -0,0 +1,303 @@
/* crypto/bn/bn_exp2.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#define TABLE_SIZE 32
int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
int i, j, bits, b, bits1, bits2, ret =
0, wpos1, wpos2, window1, window2, wvalue1, wvalue2;
int r_is_one = 1;
BIGNUM *d, *r;
const BIGNUM *a_mod_m;
/* Tables of variables obtained from 'ctx' */
BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
BN_MONT_CTX *mont = NULL;
bn_check_top(a1);
bn_check_top(p1);
bn_check_top(a2);
bn_check_top(p2);
bn_check_top(m);
if (!(m->d[0] & 1)) {
BNerr(BN_F_BN_MOD_EXP2_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
return (0);
}
bits1 = BN_num_bits(p1);
bits2 = BN_num_bits(p2);
if ((bits1 == 0) && (bits2 == 0)) {
ret = BN_one(rr);
return ret;
}
bits = (bits1 > bits2) ? bits1 : bits2;
BN_CTX_start(ctx);
d = BN_CTX_get(ctx);
r = BN_CTX_get(ctx);
val1[0] = BN_CTX_get(ctx);
val2[0] = BN_CTX_get(ctx);
if (!d || !r || !val1[0] || !val2[0])
goto err;
if (in_mont != NULL)
mont = in_mont;
else {
if ((mont = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont, m, ctx))
goto err;
}
window1 = BN_window_bits_for_exponent_size(bits1);
window2 = BN_window_bits_for_exponent_size(bits2);
/*
* Build table for a1: val1[i] := a1^(2*i + 1) mod m for i = 0 .. 2^(window1-1)
*/
if (a1->neg || BN_ucmp(a1, m) >= 0) {
if (!BN_mod(val1[0], a1, m, ctx))
goto err;
a_mod_m = val1[0];
} else
a_mod_m = a1;
if (BN_is_zero(a_mod_m)) {
BN_zero(rr);
ret = 1;
goto err;
}
if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx))
goto err;
if (window1 > 1) {
if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx))
goto err;
j = 1 << (window1 - 1);
for (i = 1; i < j; i++) {
if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
!BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx))
goto err;
}
}
/*
* Build table for a2: val2[i] := a2^(2*i + 1) mod m for i = 0 .. 2^(window2-1)
*/
if (a2->neg || BN_ucmp(a2, m) >= 0) {
if (!BN_mod(val2[0], a2, m, ctx))
goto err;
a_mod_m = val2[0];
} else
a_mod_m = a2;
if (BN_is_zero(a_mod_m)) {
BN_zero(rr);
ret = 1;
goto err;
}
if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx))
goto err;
if (window2 > 1) {
if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx))
goto err;
j = 1 << (window2 - 1);
for (i = 1; i < j; i++) {
if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
!BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx))
goto err;
}
}
/* Now compute the power product, using independent windows. */
r_is_one = 1;
wvalue1 = 0; /* The 'value' of the first window */
wvalue2 = 0; /* The 'value' of the second window */
wpos1 = 0; /* If wvalue1 > 0, the bottom bit of the
* first window */
wpos2 = 0; /* If wvalue2 > 0, the bottom bit of the
* second window */
if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
goto err;
for (b = bits - 1; b >= 0; b--) {
if (!r_is_one) {
if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
goto err;
}
if (!wvalue1)
if (BN_is_bit_set(p1, b)) {
/*
* consider bits b-window1+1 .. b for this window
*/
i = b - window1 + 1;
while (!BN_is_bit_set(p1, i)) /* works for i<0 */
i++;
wpos1 = i;
wvalue1 = 1;
for (i = b - 1; i >= wpos1; i--) {
wvalue1 <<= 1;
if (BN_is_bit_set(p1, i))
wvalue1++;
}
}
if (!wvalue2)
if (BN_is_bit_set(p2, b)) {
/*
* consider bits b-window2+1 .. b for this window
*/
i = b - window2 + 1;
while (!BN_is_bit_set(p2, i))
i++;
wpos2 = i;
wvalue2 = 1;
for (i = b - 1; i >= wpos2; i--) {
wvalue2 <<= 1;
if (BN_is_bit_set(p2, i))
wvalue2++;
}
}
if (wvalue1 && b == wpos1) {
/* wvalue1 is odd and < 2^window1 */
if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx))
goto err;
wvalue1 = 0;
r_is_one = 0;
}
if (wvalue2 && b == wpos2) {
/* wvalue2 is odd and < 2^window2 */
if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx))
goto err;
wvalue2 = 0;
r_is_one = 0;
}
}
if (!BN_from_montgomery(rr, r, mont, ctx))
goto err;
ret = 1;
err:
if ((in_mont == NULL) && (mont != NULL))
BN_MONT_CTX_free(mont);
BN_CTX_end(ctx);
bn_check_top(rr);
return (ret);
}

Binary file not shown.

View File

@@ -0,0 +1,702 @@
/* crypto/bn/bn_gcd.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#include "cryptlib.h"
#include "bn_lcl.h"
static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
{
BIGNUM *a, *b, *t;
int ret = 0;
bn_check_top(in_a);
bn_check_top(in_b);
BN_CTX_start(ctx);
a = BN_CTX_get(ctx);
b = BN_CTX_get(ctx);
if (a == NULL || b == NULL)
goto err;
if (BN_copy(a, in_a) == NULL)
goto err;
if (BN_copy(b, in_b) == NULL)
goto err;
a->neg = 0;
b->neg = 0;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
t = euclid(a, b);
if (t == NULL)
goto err;
if (BN_copy(r, t) == NULL)
goto err;
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(r);
return (ret);
}
static BIGNUM *euclid(BIGNUM *a, BIGNUM *b)
{
BIGNUM *t;
int shifts = 0;
bn_check_top(a);
bn_check_top(b);
/* 0 <= b <= a */
while (!BN_is_zero(b)) {
/* 0 < b <= a */
if (BN_is_odd(a)) {
if (BN_is_odd(b)) {
if (!BN_sub(a, a, b))
goto err;
if (!BN_rshift1(a, a))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
} else { /* a odd - b even */
if (!BN_rshift1(b, b))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
}
} else { /* a is even */
if (BN_is_odd(b)) {
if (!BN_rshift1(a, a))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
} else { /* a even - b even */
if (!BN_rshift1(a, a))
goto err;
if (!BN_rshift1(b, b))
goto err;
shifts++;
}
}
/* 0 <= b <= a */
}
if (shifts) {
if (!BN_lshift(a, a, shifts))
goto err;
}
bn_check_top(a);
return (a);
err:
return (NULL);
}
/* solves ax == 1 (mod n) */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx);
BIGNUM *BN_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
{
BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
BIGNUM *ret = NULL;
int sign;
if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0)
|| (BN_get_flags(n, BN_FLG_CONSTTIME) != 0)) {
return BN_mod_inverse_no_branch(in, a, n, ctx);
}
bn_check_top(a);
bn_check_top(n);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
X = BN_CTX_get(ctx);
D = BN_CTX_get(ctx);
M = BN_CTX_get(ctx);
Y = BN_CTX_get(ctx);
T = BN_CTX_get(ctx);
if (T == NULL)
goto err;
if (in == NULL)
R = BN_new();
else
R = in;
if (R == NULL)
goto err;
BN_one(X);
BN_zero(Y);
if (BN_copy(B, a) == NULL)
goto err;
if (BN_copy(A, n) == NULL)
goto err;
A->neg = 0;
if (B->neg || (BN_ucmp(B, A) >= 0)) {
if (!BN_nnmod(B, B, A, ctx))
goto err;
}
sign = -1;
/*-
* From B = a mod |n|, A = |n| it follows that
*
* 0 <= B < A,
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
*/
if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048))) {
/*
* Binary inversion algorithm; requires odd modulus. This is faster
* than the general algorithm if the modulus is sufficiently small
* (about 400 .. 500 bits on 32-bit sytems, but much more on 64-bit
* systems)
*/
int shift;
while (!BN_is_zero(B)) {
/*-
* 0 < B < |n|,
* 0 < A <= |n|,
* (1) -sign*X*a == B (mod |n|),
* (2) sign*Y*a == A (mod |n|)
*/
/*
* Now divide B by the maximum possible power of two in the
* integers, and divide X by the same value mod |n|. When we're
* done, (1) still holds.
*/
shift = 0;
while (!BN_is_bit_set(B, shift)) { /* note that 0 < B */
shift++;
if (BN_is_odd(X)) {
if (!BN_uadd(X, X, n))
goto err;
}
/*
* now X is even, so we can easily divide it by two
*/
if (!BN_rshift1(X, X))
goto err;
}
if (shift > 0) {
if (!BN_rshift(B, B, shift))
goto err;
}
/*
* Same for A and Y. Afterwards, (2) still holds.
*/
shift = 0;
while (!BN_is_bit_set(A, shift)) { /* note that 0 < A */
shift++;
if (BN_is_odd(Y)) {
if (!BN_uadd(Y, Y, n))
goto err;
}
/* now Y is even */
if (!BN_rshift1(Y, Y))
goto err;
}
if (shift > 0) {
if (!BN_rshift(A, A, shift))
goto err;
}
/*-
* We still have (1) and (2).
* Both A and B are odd.
* The following computations ensure that
*
* 0 <= B < |n|,
* 0 < A < |n|,
* (1) -sign*X*a == B (mod |n|),
* (2) sign*Y*a == A (mod |n|),
*
* and that either A or B is even in the next iteration.
*/
if (BN_ucmp(B, A) >= 0) {
/* -sign*(X + Y)*a == B - A (mod |n|) */
if (!BN_uadd(X, X, Y))
goto err;
/*
* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
* actually makes the algorithm slower
*/
if (!BN_usub(B, B, A))
goto err;
} else {
/* sign*(X + Y)*a == A - B (mod |n|) */
if (!BN_uadd(Y, Y, X))
goto err;
/*
* as above, BN_mod_add_quick(Y, Y, X, n) would slow things
* down
*/
if (!BN_usub(A, A, B))
goto err;
}
}
} else {
/* general inversion algorithm */
while (!BN_is_zero(B)) {
BIGNUM *tmp;
/*-
* 0 < B < A,
* (*) -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|)
*/
/* (D, M) := (A/B, A%B) ... */
if (BN_num_bits(A) == BN_num_bits(B)) {
if (!BN_one(D))
goto err;
if (!BN_sub(M, A, B))
goto err;
} else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
/* A/B is 1, 2, or 3 */
if (!BN_lshift1(T, B))
goto err;
if (BN_ucmp(A, T) < 0) {
/* A < 2*B, so D=1 */
if (!BN_one(D))
goto err;
if (!BN_sub(M, A, B))
goto err;
} else {
/* A >= 2*B, so D=2 or D=3 */
if (!BN_sub(M, A, T))
goto err;
if (!BN_add(D, T, B))
goto err; /* use D (:= 3*B) as temp */
if (BN_ucmp(A, D) < 0) {
/* A < 3*B, so D=2 */
if (!BN_set_word(D, 2))
goto err;
/*
* M (= A - 2*B) already has the correct value
*/
} else {
/* only D=3 remains */
if (!BN_set_word(D, 3))
goto err;
/*
* currently M = A - 2*B, but we need M = A - 3*B
*/
if (!BN_sub(M, M, B))
goto err;
}
}
} else {
if (!BN_div(D, M, A, B, ctx))
goto err;
}
/*-
* Now
* A = D*B + M;
* thus we have
* (**) sign*Y*a == D*B + M (mod |n|).
*/
tmp = A; /* keep the BIGNUM object, the value does not
* matter */
/* (A, B) := (B, A mod B) ... */
A = B;
B = M;
/* ... so we have 0 <= B < A again */
/*-
* Since the former M is now B and the former B is now A,
* (**) translates into
* sign*Y*a == D*A + B (mod |n|),
* i.e.
* sign*Y*a - D*A == B (mod |n|).
* Similarly, (*) translates into
* -sign*X*a == A (mod |n|).
*
* Thus,
* sign*Y*a + D*sign*X*a == B (mod |n|),
* i.e.
* sign*(Y + D*X)*a == B (mod |n|).
*
* So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
* Note that X and Y stay non-negative all the time.
*/
/*
* most of the time D is very small, so we can optimize tmp :=
* D*X+Y
*/
if (BN_is_one(D)) {
if (!BN_add(tmp, X, Y))
goto err;
} else {
if (BN_is_word(D, 2)) {
if (!BN_lshift1(tmp, X))
goto err;
} else if (BN_is_word(D, 4)) {
if (!BN_lshift(tmp, X, 2))
goto err;
} else if (D->top == 1) {
if (!BN_copy(tmp, X))
goto err;
if (!BN_mul_word(tmp, D->d[0]))
goto err;
} else {
if (!BN_mul(tmp, D, X, ctx))
goto err;
}
if (!BN_add(tmp, tmp, Y))
goto err;
}
M = Y; /* keep the BIGNUM object, the value does not
* matter */
Y = X;
X = tmp;
sign = -sign;
}
}
/*-
* The while loop (Euclid's algorithm) ends when
* A == gcd(a,n);
* we have
* sign*Y*a == A (mod |n|),
* where Y is non-negative.
*/
if (sign < 0) {
if (!BN_sub(Y, n, Y))
goto err;
}
/* Now Y*a == A (mod |n|). */
if (BN_is_one(A)) {
/* Y*a == 1 (mod |n|) */
if (!Y->neg && BN_ucmp(Y, n) < 0) {
if (!BN_copy(R, Y))
goto err;
} else {
if (!BN_nnmod(R, Y, n, ctx))
goto err;
}
} else {
BNerr(BN_F_BN_MOD_INVERSE, BN_R_NO_INVERSE);
goto err;
}
ret = R;
err:
if ((ret == NULL) && (in == NULL))
BN_free(R);
BN_CTX_end(ctx);
bn_check_top(ret);
return (ret);
}
/*
* BN_mod_inverse_no_branch is a special version of BN_mod_inverse. It does
* not contain branches that may leak sensitive information.
*/
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx)
{
BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
BIGNUM local_A, local_B;
BIGNUM *pA, *pB;
BIGNUM *ret = NULL;
int sign;
bn_check_top(a);
bn_check_top(n);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
X = BN_CTX_get(ctx);
D = BN_CTX_get(ctx);
M = BN_CTX_get(ctx);
Y = BN_CTX_get(ctx);
T = BN_CTX_get(ctx);
if (T == NULL)
goto err;
if (in == NULL)
R = BN_new();
else
R = in;
if (R == NULL)
goto err;
BN_one(X);
BN_zero(Y);
if (BN_copy(B, a) == NULL)
goto err;
if (BN_copy(A, n) == NULL)
goto err;
A->neg = 0;
if (B->neg || (BN_ucmp(B, A) >= 0)) {
/*
* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
* BN_div_no_branch will be called eventually.
*/
pB = &local_B;
local_B.flags = 0;
BN_with_flags(pB, B, BN_FLG_CONSTTIME);
if (!BN_nnmod(B, pB, A, ctx))
goto err;
}
sign = -1;
/*-
* From B = a mod |n|, A = |n| it follows that
*
* 0 <= B < A,
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
*/
while (!BN_is_zero(B)) {
BIGNUM *tmp;
/*-
* 0 < B < A,
* (*) -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|)
*/
/*
* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
* BN_div_no_branch will be called eventually.
*/
pA = &local_A;
local_A.flags = 0;
BN_with_flags(pA, A, BN_FLG_CONSTTIME);
/* (D, M) := (A/B, A%B) ... */
if (!BN_div(D, M, pA, B, ctx))
goto err;
/*-
* Now
* A = D*B + M;
* thus we have
* (**) sign*Y*a == D*B + M (mod |n|).
*/
tmp = A; /* keep the BIGNUM object, the value does not
* matter */
/* (A, B) := (B, A mod B) ... */
A = B;
B = M;
/* ... so we have 0 <= B < A again */
/*-
* Since the former M is now B and the former B is now A,
* (**) translates into
* sign*Y*a == D*A + B (mod |n|),
* i.e.
* sign*Y*a - D*A == B (mod |n|).
* Similarly, (*) translates into
* -sign*X*a == A (mod |n|).
*
* Thus,
* sign*Y*a + D*sign*X*a == B (mod |n|),
* i.e.
* sign*(Y + D*X)*a == B (mod |n|).
*
* So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
* Note that X and Y stay non-negative all the time.
*/
if (!BN_mul(tmp, D, X, ctx))
goto err;
if (!BN_add(tmp, tmp, Y))
goto err;
M = Y; /* keep the BIGNUM object, the value does not
* matter */
Y = X;
X = tmp;
sign = -sign;
}
/*-
* The while loop (Euclid's algorithm) ends when
* A == gcd(a,n);
* we have
* sign*Y*a == A (mod |n|),
* where Y is non-negative.
*/
if (sign < 0) {
if (!BN_sub(Y, n, Y))
goto err;
}
/* Now Y*a == A (mod |n|). */
if (BN_is_one(A)) {
/* Y*a == 1 (mod |n|) */
if (!Y->neg && BN_ucmp(Y, n) < 0) {
if (!BN_copy(R, Y))
goto err;
} else {
if (!BN_nnmod(R, Y, n, ctx))
goto err;
}
} else {
BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH, BN_R_NO_INVERSE);
goto err;
}
ret = R;
err:
if ((ret == NULL) && (in == NULL))
BN_free(R);
BN_CTX_end(ctx);
bn_check_top(ret);
return (ret);
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -0,0 +1,186 @@
/* crypto/bn/bn_kron.c */
/* ====================================================================
* Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#include "cryptlib.h"
#include "bn_lcl.h"
/* least significant word */
#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
/* Returns -2 for errors because both -1 and 0 are valid results. */
int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
int i;
int ret = -2; /* avoid 'uninitialized' warning */
int err = 0;
BIGNUM *A, *B, *tmp;
/*-
* In 'tab', only odd-indexed entries are relevant:
* For any odd BIGNUM n,
* tab[BN_lsw(n) & 7]
* is $(-1)^{(n^2-1)/8}$ (using TeX notation).
* Note that the sign of n does not matter.
*/
static const int tab[8] = { 0, 1, 0, -1, 0, -1, 0, 1 };
bn_check_top(a);
bn_check_top(b);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
if (B == NULL)
goto end;
err = !BN_copy(A, a);
if (err)
goto end;
err = !BN_copy(B, b);
if (err)
goto end;
/*
* Kronecker symbol, imlemented according to Henri Cohen,
* "A Course in Computational Algebraic Number Theory"
* (algorithm 1.4.10).
*/
/* Cohen's step 1: */
if (BN_is_zero(B)) {
ret = BN_abs_is_word(A, 1);
goto end;
}
/* Cohen's step 2: */
if (!BN_is_odd(A) && !BN_is_odd(B)) {
ret = 0;
goto end;
}
/* now B is non-zero */
i = 0;
while (!BN_is_bit_set(B, i))
i++;
err = !BN_rshift(B, B, i);
if (err)
goto end;
if (i & 1) {
/* i is odd */
/* (thus B was even, thus A must be odd!) */
/* set 'ret' to $(-1)^{(A^2-1)/8}$ */
ret = tab[BN_lsw(A) & 7];
} else {
/* i is even */
ret = 1;
}
if (B->neg) {
B->neg = 0;
if (A->neg)
ret = -ret;
}
/*
* now B is positive and odd, so what remains to be done is to compute
* the Jacobi symbol (A/B) and multiply it by 'ret'
*/
while (1) {
/* Cohen's step 3: */
/* B is positive and odd */
if (BN_is_zero(A)) {
ret = BN_is_one(B) ? ret : 0;
goto end;
}
/* now A is non-zero */
i = 0;
while (!BN_is_bit_set(A, i))
i++;
err = !BN_rshift(A, A, i);
if (err)
goto end;
if (i & 1) {
/* i is odd */
/* multiply 'ret' by $(-1)^{(B^2-1)/8}$ */
ret = ret * tab[BN_lsw(B) & 7];
}
/* Cohen's step 4: */
/* multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ */
if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2)
ret = -ret;
/* (A, B) := (B mod |A|, |A|) */
err = !BN_nnmod(B, B, A, ctx);
if (err)
goto end;
tmp = A;
A = B;
B = tmp;
tmp->neg = 0;
}
end:
BN_CTX_end(ctx);
if (err)
return -2;
else
return ret;
}

Binary file not shown.

View File

@@ -0,0 +1,537 @@
/* crypto/bn/bn_lcl.h */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#ifndef HEADER_BN_LCL_H
# define HEADER_BN_LCL_H
# include <openssl/bn.h>
#ifdef __cplusplus
extern "C" {
#endif
/*-
* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
*
*
* For window size 'w' (w >= 2) and a random 'b' bits exponent,
* the number of multiplications is a constant plus on average
*
* 2^(w-1) + (b-w)/(w+1);
*
* here 2^(w-1) is for precomputing the table (we actually need
* entries only for windows that have the lowest bit set), and
* (b-w)/(w+1) is an approximation for the expected number of
* w-bit windows, not counting the first one.
*
* Thus we should use
*
* w >= 6 if b > 671
* w = 5 if 671 > b > 239
* w = 4 if 239 > b > 79
* w = 3 if 79 > b > 23
* w <= 2 if 23 > b
*
* (with draws in between). Very small exponents are often selected
* with low Hamming weight, so we use w = 1 for b <= 23.
*/
# if 1
# define BN_window_bits_for_exponent_size(b) \
((b) > 671 ? 6 : \
(b) > 239 ? 5 : \
(b) > 79 ? 4 : \
(b) > 23 ? 3 : 1)
# else
/*
* Old SSLeay/OpenSSL table. Maximum window size was 5, so this table differs
* for b==1024; but it coincides for other interesting values (b==160,
* b==512).
*/
# define BN_window_bits_for_exponent_size(b) \
((b) > 255 ? 5 : \
(b) > 127 ? 4 : \
(b) > 17 ? 3 : 1)
# endif
/*
* BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache
* line width of the target processor is at least the following value.
*/
# define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
# define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
/*
* Window sizes optimized for fixed window size modular exponentiation
* algorithm (BN_mod_exp_mont_consttime). To achieve the security goals of
* BN_mode_exp_mont_consttime, the maximum size of the window must not exceed
* log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). Window size thresholds are
* defined for cache line sizes of 32 and 64, cache line sizes where
* log_2(32)=5 and log_2(64)=6 respectively. A window size of 7 should only be
* used on processors that have a 128 byte or greater cache line size.
*/
# if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
# define BN_window_bits_for_ctime_exponent_size(b) \
((b) > 937 ? 6 : \
(b) > 306 ? 5 : \
(b) > 89 ? 4 : \
(b) > 22 ? 3 : 1)
# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
# elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
# define BN_window_bits_for_ctime_exponent_size(b) \
((b) > 306 ? 5 : \
(b) > 89 ? 4 : \
(b) > 22 ? 3 : 1)
# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
# endif
/* Pentium pro 16,16,16,32,64 */
/* Alpha 16,16,16,16.64 */
# define BN_MULL_SIZE_NORMAL (16)/* 32 */
# define BN_MUL_RECURSIVE_SIZE_NORMAL (16)/* 32 less than */
# define BN_SQR_RECURSIVE_SIZE_NORMAL (16)/* 32 */
# define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32)/* 32 */
# define BN_MONT_CTX_SET_SIZE_WORD (64)/* 32 */
/*
* 2011-02-22 SMS. In various places, a size_t variable or a type cast to
* size_t was used to perform integer-only operations on pointers. This
* failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
* is still only 32 bits. What's needed in these cases is an integer type
* with the same size as a pointer, which size_t is not certain to be. The
* only fix here is VMS-specific.
*/
# if defined(OPENSSL_SYS_VMS)
# if __INITIAL_POINTER_SIZE == 64
# define PTR_SIZE_INT long long
# else /* __INITIAL_POINTER_SIZE == 64 */
# define PTR_SIZE_INT int
# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
# elif !defined(PTR_SIZE_INT) /* defined(OPENSSL_SYS_VMS) */
# define PTR_SIZE_INT size_t
# endif /* defined(OPENSSL_SYS_VMS) [else] */
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
/*
* BN_UMULT_HIGH section.
*
* No, I'm not trying to overwhelm you when stating that the
* product of N-bit numbers is 2*N bits wide:-) No, I don't expect
* you to be impressed when I say that if the compiler doesn't
* support 2*N integer type, then you have to replace every N*N
* multiplication with 4 (N/2)*(N/2) accompanied by some shifts
* and additions which unavoidably results in severe performance
* penalties. Of course provided that the hardware is capable of
* producing 2*N result... That's when you normally start
* considering assembler implementation. However! It should be
* pointed out that some CPUs (most notably Alpha, PowerPC and
* upcoming IA-64 family:-) provide *separate* instruction
* calculating the upper half of the product placing the result
* into a general purpose register. Now *if* the compiler supports
* inline assembler, then it's not impossible to implement the
* "bignum" routines (and have the compiler optimize 'em)
* exhibiting "native" performance in C. That's what BN_UMULT_HIGH
* macro is about:-)
*
* <appro@fy.chalmers.se>
*/
# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
: "=a"(discard),"=d"(ret) \
: "a"(a), "g"(b) \
: "cc"); \
ret; })
# define BN_UMULT_LOHI(low,high,a,b) \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(a),"g"(b) \
: "cc");
# endif
# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
# if defined(_MSC_VER) && _MSC_VER>=1400
unsigned __int64 __umulh(unsigned __int64 a, unsigned __int64 b);
unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
unsigned __int64 *h);
# pragma intrinsic(__umulh,_umul128)
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
# if defined(__GNUC__) && __GNUC__>=2
# if __GNUC__>4 || (__GNUC__>=4 && __GNUC_MINOR__>=4)
/* "h" constraint is no more since 4.4 */
# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
# define BN_UMULT_LOHI(low,high,a,b) ({ \
__uint128_t ret=(__uint128_t)(a)*(b); \
(high)=ret>>64; (low)=ret; })
# else
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("dmultu %1,%2" \
: "=h"(ret) \
: "r"(a), "r"(b) : "l"); \
ret; })
# define BN_UMULT_LOHI(low,high,a,b)\
asm ("dmultu %2,%3" \
: "=l"(low),"=h"(high) \
: "r"(a), "r"(b));
# endif
# endif
# elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %0,%1,%2" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif
# endif /* cpu */
# endif /* OPENSSL_NO_ASM */
/*************************************************************
* Using the long long type
*/
# define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
# define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
# ifdef BN_DEBUG_RAND
# define bn_clear_top2max(a) \
{ \
int ind = (a)->dmax - (a)->top; \
BN_ULONG *ftl = &(a)->d[(a)->top-1]; \
for (; ind != 0; ind--) \
*(++ftl) = 0x0; \
}
# else
# define bn_clear_top2max(a)
# endif
# ifdef BN_LLONG
# define mul_add(r,a,w,c) { \
BN_ULLONG t; \
t=(BN_ULLONG)w * (a) + (r) + (c); \
(r)= Lw(t); \
(c)= Hw(t); \
}
# define mul(r,a,w,c) { \
BN_ULLONG t; \
t=(BN_ULLONG)w * (a) + (c); \
(r)= Lw(t); \
(c)= Hw(t); \
}
# define sqr(r0,r1,a) { \
BN_ULLONG t; \
t=(BN_ULLONG)(a)*(a); \
(r0)=Lw(t); \
(r1)=Hw(t); \
}
# elif defined(BN_UMULT_LOHI)
# define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
ret = (r); \
BN_UMULT_LOHI(low,high,w,tmp); \
ret += (c); \
(c) = (ret<(c))?1:0; \
(c) += high; \
ret += low; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define mul(r,a,w,c) { \
BN_ULONG high,low,ret,ta=(a); \
BN_UMULT_LOHI(low,high,w,ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define sqr(r0,r1,a) { \
BN_ULONG tmp=(a); \
BN_UMULT_LOHI(r0,r1,tmp,tmp); \
}
# elif defined(BN_UMULT_HIGH)
# define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
ret = (r); \
high= BN_UMULT_HIGH(w,tmp); \
ret += (c); \
low = (w) * tmp; \
(c) = (ret<(c))?1:0; \
(c) += high; \
ret += low; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define mul(r,a,w,c) { \
BN_ULONG high,low,ret,ta=(a); \
low = (w) * ta; \
high= BN_UMULT_HIGH(w,ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define sqr(r0,r1,a) { \
BN_ULONG tmp=(a); \
(r0) = tmp * tmp; \
(r1) = BN_UMULT_HIGH(tmp,tmp); \
}
# else
/*************************************************************
* No long long type
*/
# define LBITS(a) ((a)&BN_MASK2l)
# define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l)
# define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2)
# define LLBITS(a) ((a)&BN_MASKl)
# define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl)
# define LL2HBITS(a) ((BN_ULLONG)((a)&BN_MASKl)<<BN_BITS2)
# define mul64(l,h,bl,bh) \
{ \
BN_ULONG m,m1,lt,ht; \
\
lt=l; \
ht=h; \
m =(bh)*(lt); \
lt=(bl)*(lt); \
m1=(bl)*(ht); \
ht =(bh)*(ht); \
m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
ht+=HBITS(m); \
m1=L2HBITS(m); \
lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
(l)=lt; \
(h)=ht; \
}
# define sqr64(lo,ho,in) \
{ \
BN_ULONG l,h,m; \
\
h=(in); \
l=LBITS(h); \
h=HBITS(h); \
m =(l)*(h); \
l*=l; \
h*=h; \
h+=(m&BN_MASK2h1)>>(BN_BITS4-1); \
m =(m&BN_MASK2l)<<(BN_BITS4+1); \
l=(l+m)&BN_MASK2; if (l < m) h++; \
(lo)=l; \
(ho)=h; \
}
# define mul_add(r,a,bl,bh,c) { \
BN_ULONG l,h; \
\
h= (a); \
l=LBITS(h); \
h=HBITS(h); \
mul64(l,h,(bl),(bh)); \
\
/* non-multiply part */ \
l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
(c)=(r); \
l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
(c)=h&BN_MASK2; \
(r)=l; \
}
# define mul(r,a,bl,bh,c) { \
BN_ULONG l,h; \
\
h= (a); \
l=LBITS(h); \
h=HBITS(h); \
mul64(l,h,(bl),(bh)); \
\
/* non-multiply part */ \
l+=(c); if ((l&BN_MASK2) < (c)) h++; \
(c)=h&BN_MASK2; \
(r)=l&BN_MASK2; \
}
# endif /* !BN_LLONG */
# if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
# undef bn_div_words
# endif
void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
int dna, int dnb, BN_ULONG *t);
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
int n, int tna, int tnb, BN_ULONG *t);
void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t);
void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
BN_ULONG *t);
void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
BN_ULONG *t);
BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl);
BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl);
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,916 @@
/* crypto/bn/bn_lib.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#ifndef BN_DEBUG
# undef NDEBUG /* avoid conflicting definitions */
# define NDEBUG
#endif
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
const char BN_version[] = "Big Number" OPENSSL_VERSION_PTEXT;
/* This stuff appears to be completely unused, so is deprecated */
#ifndef OPENSSL_NO_DEPRECATED
/*-
* For a 32 bit machine
* 2 - 4 == 128
* 3 - 8 == 256
* 4 - 16 == 512
* 5 - 32 == 1024
* 6 - 64 == 2048
* 7 - 128 == 4096
* 8 - 256 == 8192
*/
static int bn_limit_bits = 0;
static int bn_limit_num = 8; /* (1<<bn_limit_bits) */
static int bn_limit_bits_low = 0;
static int bn_limit_num_low = 8; /* (1<<bn_limit_bits_low) */
static int bn_limit_bits_high = 0;
static int bn_limit_num_high = 8; /* (1<<bn_limit_bits_high) */
static int bn_limit_bits_mont = 0;
static int bn_limit_num_mont = 8; /* (1<<bn_limit_bits_mont) */
void BN_set_params(int mult, int high, int low, int mont)
{
if (mult >= 0) {
if (mult > (int)(sizeof(int) * 8) - 1)
mult = sizeof(int) * 8 - 1;
bn_limit_bits = mult;
bn_limit_num = 1 << mult;
}
if (high >= 0) {
if (high > (int)(sizeof(int) * 8) - 1)
high = sizeof(int) * 8 - 1;
bn_limit_bits_high = high;
bn_limit_num_high = 1 << high;
}
if (low >= 0) {
if (low > (int)(sizeof(int) * 8) - 1)
low = sizeof(int) * 8 - 1;
bn_limit_bits_low = low;
bn_limit_num_low = 1 << low;
}
if (mont >= 0) {
if (mont > (int)(sizeof(int) * 8) - 1)
mont = sizeof(int) * 8 - 1;
bn_limit_bits_mont = mont;
bn_limit_num_mont = 1 << mont;
}
}
int BN_get_params(int which)
{
if (which == 0)
return (bn_limit_bits);
else if (which == 1)
return (bn_limit_bits_high);
else if (which == 2)
return (bn_limit_bits_low);
else if (which == 3)
return (bn_limit_bits_mont);
else
return (0);
}
#endif
const BIGNUM *BN_value_one(void)
{
static const BN_ULONG data_one = 1L;
static const BIGNUM const_one =
{ (BN_ULONG *)&data_one, 1, 1, 0, BN_FLG_STATIC_DATA };
return (&const_one);
}
int BN_num_bits_word(BN_ULONG l)
{
static const unsigned char bits[256] = {
0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
};
#if defined(SIXTY_FOUR_BIT_LONG)
if (l & 0xffffffff00000000L) {
if (l & 0xffff000000000000L) {
if (l & 0xff00000000000000L) {
return (bits[(int)(l >> 56)] + 56);
} else
return (bits[(int)(l >> 48)] + 48);
} else {
if (l & 0x0000ff0000000000L) {
return (bits[(int)(l >> 40)] + 40);
} else
return (bits[(int)(l >> 32)] + 32);
}
} else
#else
# ifdef SIXTY_FOUR_BIT
if (l & 0xffffffff00000000LL) {
if (l & 0xffff000000000000LL) {
if (l & 0xff00000000000000LL) {
return (bits[(int)(l >> 56)] + 56);
} else
return (bits[(int)(l >> 48)] + 48);
} else {
if (l & 0x0000ff0000000000LL) {
return (bits[(int)(l >> 40)] + 40);
} else
return (bits[(int)(l >> 32)] + 32);
}
} else
# endif
#endif
{
#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
if (l & 0xffff0000L) {
if (l & 0xff000000L)
return (bits[(int)(l >> 24L)] + 24);
else
return (bits[(int)(l >> 16L)] + 16);
} else
#endif
{
#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
if (l & 0xff00L)
return (bits[(int)(l >> 8)] + 8);
else
#endif
return (bits[(int)(l)]);
}
}
}
int BN_num_bits(const BIGNUM *a)
{
int i = a->top - 1;
bn_check_top(a);
if (BN_is_zero(a))
return 0;
return ((i * BN_BITS2) + BN_num_bits_word(a->d[i]));
}
void BN_clear_free(BIGNUM *a)
{
int i;
if (a == NULL)
return;
bn_check_top(a);
if (a->d != NULL) {
OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
if (!(BN_get_flags(a, BN_FLG_STATIC_DATA)))
OPENSSL_free(a->d);
}
i = BN_get_flags(a, BN_FLG_MALLOCED);
OPENSSL_cleanse(a, sizeof(BIGNUM));
if (i)
OPENSSL_free(a);
}
void BN_free(BIGNUM *a)
{
if (a == NULL)
return;
bn_check_top(a);
if ((a->d != NULL) && !(BN_get_flags(a, BN_FLG_STATIC_DATA)))
OPENSSL_free(a->d);
if (a->flags & BN_FLG_MALLOCED)
OPENSSL_free(a);
else {
#ifndef OPENSSL_NO_DEPRECATED
a->flags |= BN_FLG_FREE;
#endif
a->d = NULL;
}
}
void BN_init(BIGNUM *a)
{
memset(a, 0, sizeof(BIGNUM));
bn_check_top(a);
}
BIGNUM *BN_new(void)
{
BIGNUM *ret;
if ((ret = (BIGNUM *)OPENSSL_malloc(sizeof(BIGNUM))) == NULL) {
BNerr(BN_F_BN_NEW, ERR_R_MALLOC_FAILURE);
return (NULL);
}
ret->flags = BN_FLG_MALLOCED;
ret->top = 0;
ret->neg = 0;
ret->dmax = 0;
ret->d = NULL;
bn_check_top(ret);
return (ret);
}
/* This is used both by bn_expand2() and bn_dup_expand() */
/* The caller MUST check that words > b->dmax before calling this */
static BN_ULONG *bn_expand_internal(const BIGNUM *b, int words)
{
BN_ULONG *A, *a = NULL;
const BN_ULONG *B;
int i;
bn_check_top(b);
if (words > (INT_MAX / (4 * BN_BITS2))) {
BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_BIGNUM_TOO_LONG);
return NULL;
}
if (BN_get_flags(b, BN_FLG_STATIC_DATA)) {
BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
return (NULL);
}
a = A = (BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG) * words);
if (A == NULL) {
BNerr(BN_F_BN_EXPAND_INTERNAL, ERR_R_MALLOC_FAILURE);
return (NULL);
}
#ifdef PURIFY
/*
* Valgrind complains in BN_consttime_swap because we process the whole
* array even if it's not initialised yet. This doesn't matter in that
* function - what's important is constant time operation (we're not
* actually going to use the data)
*/
memset(a, 0, sizeof(BN_ULONG) * words);
#endif
#if 1
B = b->d;
/* Check if the previous number needs to be copied */
if (B != NULL) {
for (i = b->top >> 2; i > 0; i--, A += 4, B += 4) {
/*
* The fact that the loop is unrolled
* 4-wise is a tribute to Intel. It's
* the one that doesn't have enough
* registers to accomodate more data.
* I'd unroll it 8-wise otherwise:-)
*
* <appro@fy.chalmers.se>
*/
BN_ULONG a0, a1, a2, a3;
a0 = B[0];
a1 = B[1];
a2 = B[2];
a3 = B[3];
A[0] = a0;
A[1] = a1;
A[2] = a2;
A[3] = a3;
}
/*
* workaround for ultrix cc: without 'case 0', the optimizer does
* the switch table by doing a=top&3; a--; goto jump_table[a];
* which fails for top== 0
*/
switch (b->top & 3) {
case 3:
A[2] = B[2];
case 2:
A[1] = B[1];
case 1:
A[0] = B[0];
case 0:
;
}
}
#else
memset(A, 0, sizeof(BN_ULONG) * words);
memcpy(A, b->d, sizeof(b->d[0]) * b->top);
#endif
return (a);
}
/*
* This is an internal function that can be used instead of bn_expand2() when
* there is a need to copy BIGNUMs instead of only expanding the data part,
* while still expanding them. Especially useful when needing to expand
* BIGNUMs that are declared 'const' and should therefore not be changed. The
* reason to use this instead of a BN_dup() followed by a bn_expand2() is
* memory allocation overhead. A BN_dup() followed by a bn_expand2() will
* allocate new memory for the BIGNUM data twice, and free it once, while
* bn_dup_expand() makes sure allocation is made only once.
*/
#ifndef OPENSSL_NO_DEPRECATED
BIGNUM *bn_dup_expand(const BIGNUM *b, int words)
{
BIGNUM *r = NULL;
bn_check_top(b);
/*
* This function does not work if words <= b->dmax && top < words because
* BN_dup() does not preserve 'dmax'! (But bn_dup_expand() is not used
* anywhere yet.)
*/
if (words > b->dmax) {
BN_ULONG *a = bn_expand_internal(b, words);
if (a) {
r = BN_new();
if (r) {
r->top = b->top;
r->dmax = words;
r->neg = b->neg;
r->d = a;
} else {
/* r == NULL, BN_new failure */
OPENSSL_free(a);
}
}
/*
* If a == NULL, there was an error in allocation in
* bn_expand_internal(), and NULL should be returned
*/
} else {
r = BN_dup(b);
}
bn_check_top(r);
return r;
}
#endif
/*
* This is an internal function that should not be used in applications. It
* ensures that 'b' has enough room for a 'words' word number and initialises
* any unused part of b->d with leading zeros. It is mostly used by the
* various BIGNUM routines. If there is an error, NULL is returned. If not,
* 'b' is returned.
*/
BIGNUM *bn_expand2(BIGNUM *b, int words)
{
bn_check_top(b);
if (words > b->dmax) {
BN_ULONG *a = bn_expand_internal(b, words);
if (!a)
return NULL;
if (b->d)
OPENSSL_free(b->d);
b->d = a;
b->dmax = words;
}
/* None of this should be necessary because of what b->top means! */
#if 0
/*
* NB: bn_wexpand() calls this only if the BIGNUM really has to grow
*/
if (b->top < b->dmax) {
int i;
BN_ULONG *A = &(b->d[b->top]);
for (i = (b->dmax - b->top) >> 3; i > 0; i--, A += 8) {
A[0] = 0;
A[1] = 0;
A[2] = 0;
A[3] = 0;
A[4] = 0;
A[5] = 0;
A[6] = 0;
A[7] = 0;
}
for (i = (b->dmax - b->top) & 7; i > 0; i--, A++)
A[0] = 0;
assert(A == &(b->d[b->dmax]));
}
#endif
bn_check_top(b);
return b;
}
BIGNUM *BN_dup(const BIGNUM *a)
{
BIGNUM *t;
if (a == NULL)
return NULL;
bn_check_top(a);
t = BN_new();
if (t == NULL)
return NULL;
if (!BN_copy(t, a)) {
BN_free(t);
return NULL;
}
bn_check_top(t);
return t;
}
BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
{
int i;
BN_ULONG *A;
const BN_ULONG *B;
bn_check_top(b);
if (a == b)
return (a);
if (bn_wexpand(a, b->top) == NULL)
return (NULL);
#if 1
A = a->d;
B = b->d;
for (i = b->top >> 2; i > 0; i--, A += 4, B += 4) {
BN_ULONG a0, a1, a2, a3;
a0 = B[0];
a1 = B[1];
a2 = B[2];
a3 = B[3];
A[0] = a0;
A[1] = a1;
A[2] = a2;
A[3] = a3;
}
/* ultrix cc workaround, see comments in bn_expand_internal */
switch (b->top & 3) {
case 3:
A[2] = B[2];
case 2:
A[1] = B[1];
case 1:
A[0] = B[0];
case 0:;
}
#else
memcpy(a->d, b->d, sizeof(b->d[0]) * b->top);
#endif
a->top = b->top;
a->neg = b->neg;
bn_check_top(a);
return (a);
}
void BN_swap(BIGNUM *a, BIGNUM *b)
{
int flags_old_a, flags_old_b;
BN_ULONG *tmp_d;
int tmp_top, tmp_dmax, tmp_neg;
bn_check_top(a);
bn_check_top(b);
flags_old_a = a->flags;
flags_old_b = b->flags;
tmp_d = a->d;
tmp_top = a->top;
tmp_dmax = a->dmax;
tmp_neg = a->neg;
a->d = b->d;
a->top = b->top;
a->dmax = b->dmax;
a->neg = b->neg;
b->d = tmp_d;
b->top = tmp_top;
b->dmax = tmp_dmax;
b->neg = tmp_neg;
a->flags =
(flags_old_a & BN_FLG_MALLOCED) | (flags_old_b & BN_FLG_STATIC_DATA);
b->flags =
(flags_old_b & BN_FLG_MALLOCED) | (flags_old_a & BN_FLG_STATIC_DATA);
bn_check_top(a);
bn_check_top(b);
}
void BN_clear(BIGNUM *a)
{
bn_check_top(a);
if (a->d != NULL)
memset(a->d, 0, a->dmax * sizeof(a->d[0]));
a->top = 0;
a->neg = 0;
}
BN_ULONG BN_get_word(const BIGNUM *a)
{
if (a->top > 1)
return BN_MASK2;
else if (a->top == 1)
return a->d[0];
/* a->top == 0 */
return 0;
}
int BN_set_word(BIGNUM *a, BN_ULONG w)
{
bn_check_top(a);
if (bn_expand(a, (int)sizeof(BN_ULONG) * 8) == NULL)
return (0);
a->neg = 0;
a->d[0] = w;
a->top = (w ? 1 : 0);
bn_check_top(a);
return (1);
}
BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
{
unsigned int i, m;
unsigned int n;
BN_ULONG l;
BIGNUM *bn = NULL;
if (ret == NULL)
ret = bn = BN_new();
if (ret == NULL)
return (NULL);
bn_check_top(ret);
l = 0;
n = len;
if (n == 0) {
ret->top = 0;
return (ret);
}
i = ((n - 1) / BN_BYTES) + 1;
m = ((n - 1) % (BN_BYTES));
if (bn_wexpand(ret, (int)i) == NULL) {
if (bn)
BN_free(bn);
return NULL;
}
ret->top = i;
ret->neg = 0;
while (n--) {
l = (l << 8L) | *(s++);
if (m-- == 0) {
ret->d[--i] = l;
l = 0;
m = BN_BYTES - 1;
}
}
/*
* need to call this due to clear byte at top if avoiding having the top
* bit set (-ve number)
*/
bn_correct_top(ret);
return (ret);
}
/* ignore negative */
int BN_bn2bin(const BIGNUM *a, unsigned char *to)
{
int n, i;
BN_ULONG l;
bn_check_top(a);
n = i = BN_num_bytes(a);
while (i--) {
l = a->d[i / BN_BYTES];
*(to++) = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
}
return (n);
}
int BN_ucmp(const BIGNUM *a, const BIGNUM *b)
{
int i;
BN_ULONG t1, t2, *ap, *bp;
bn_check_top(a);
bn_check_top(b);
i = a->top - b->top;
if (i != 0)
return (i);
ap = a->d;
bp = b->d;
for (i = a->top - 1; i >= 0; i--) {
t1 = ap[i];
t2 = bp[i];
if (t1 != t2)
return ((t1 > t2) ? 1 : -1);
}
return (0);
}
int BN_cmp(const BIGNUM *a, const BIGNUM *b)
{
int i;
int gt, lt;
BN_ULONG t1, t2;
if ((a == NULL) || (b == NULL)) {
if (a != NULL)
return (-1);
else if (b != NULL)
return (1);
else
return (0);
}
bn_check_top(a);
bn_check_top(b);
if (a->neg != b->neg) {
if (a->neg)
return (-1);
else
return (1);
}
if (a->neg == 0) {
gt = 1;
lt = -1;
} else {
gt = -1;
lt = 1;
}
if (a->top > b->top)
return (gt);
if (a->top < b->top)
return (lt);
for (i = a->top - 1; i >= 0; i--) {
t1 = a->d[i];
t2 = b->d[i];
if (t1 > t2)
return (gt);
if (t1 < t2)
return (lt);
}
return (0);
}
int BN_set_bit(BIGNUM *a, int n)
{
int i, j, k;
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i) {
if (bn_wexpand(a, i + 1) == NULL)
return (0);
for (k = a->top; k < i + 1; k++)
a->d[k] = 0;
a->top = i + 1;
}
a->d[i] |= (((BN_ULONG)1) << j);
bn_check_top(a);
return (1);
}
int BN_clear_bit(BIGNUM *a, int n)
{
int i, j;
bn_check_top(a);
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i)
return (0);
a->d[i] &= (~(((BN_ULONG)1) << j));
bn_correct_top(a);
return (1);
}
int BN_is_bit_set(const BIGNUM *a, int n)
{
int i, j;
bn_check_top(a);
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i)
return 0;
return (int)(((a->d[i]) >> j) & ((BN_ULONG)1));
}
int BN_mask_bits(BIGNUM *a, int n)
{
int b, w;
bn_check_top(a);
if (n < 0)
return 0;
w = n / BN_BITS2;
b = n % BN_BITS2;
if (w >= a->top)
return 0;
if (b == 0)
a->top = w;
else {
a->top = w + 1;
a->d[w] &= ~(BN_MASK2 << b);
}
bn_correct_top(a);
return (1);
}
void BN_set_negative(BIGNUM *a, int b)
{
if (b && !BN_is_zero(a))
a->neg = 1;
else
a->neg = 0;
}
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
{
int i;
BN_ULONG aa, bb;
aa = a[n - 1];
bb = b[n - 1];
if (aa != bb)
return ((aa > bb) ? 1 : -1);
for (i = n - 2; i >= 0; i--) {
aa = a[i];
bb = b[i];
if (aa != bb)
return ((aa > bb) ? 1 : -1);
}
return (0);
}
/*
* Here follows a specialised variants of bn_cmp_words(). It has the
* property of performing the operation on arrays of different sizes. The
* sizes of those arrays is expressed through cl, which is the common length
* ( basicall, min(len(a),len(b)) ), and dl, which is the delta between the
* two lengths, calculated as len(a)-len(b). All lengths are the number of
* BN_ULONGs...
*/
int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl)
{
int n, i;
n = cl - 1;
if (dl < 0) {
for (i = dl; i < 0; i++) {
if (b[n - i] != 0)
return -1; /* a < b */
}
}
if (dl > 0) {
for (i = dl; i > 0; i--) {
if (a[n + i] != 0)
return 1; /* a > b */
}
}
return bn_cmp_words(a, b, cl);
}
/*
* Constant-time conditional swap of a and b.
* a and b are swapped if condition is not 0. The code assumes that at most one bit of condition is set.
* nwords is the number of words to swap. The code assumes that at least nwords are allocated in both a and b,
* and that no more than nwords are used by either a or b.
* a and b cannot be the same number
*/
void BN_consttime_swap(BN_ULONG condition, BIGNUM *a, BIGNUM *b, int nwords)
{
BN_ULONG t;
int i;
bn_wcheck_size(a, nwords);
bn_wcheck_size(b, nwords);
assert(a != b);
assert((condition & (condition - 1)) == 0);
assert(sizeof(BN_ULONG) >= sizeof(int));
condition = ((condition - 1) >> (BN_BITS2 - 1)) - 1;
t = (a->top ^ b->top) & condition;
a->top ^= t;
b->top ^= t;
#define BN_CONSTTIME_SWAP(ind) \
do { \
t = (a->d[ind] ^ b->d[ind]) & condition; \
a->d[ind] ^= t; \
b->d[ind] ^= t; \
} while (0)
switch (nwords) {
default:
for (i = 10; i < nwords; i++)
BN_CONSTTIME_SWAP(i);
/* Fallthrough */
case 10:
BN_CONSTTIME_SWAP(9); /* Fallthrough */
case 9:
BN_CONSTTIME_SWAP(8); /* Fallthrough */
case 8:
BN_CONSTTIME_SWAP(7); /* Fallthrough */
case 7:
BN_CONSTTIME_SWAP(6); /* Fallthrough */
case 6:
BN_CONSTTIME_SWAP(5); /* Fallthrough */
case 5:
BN_CONSTTIME_SWAP(4); /* Fallthrough */
case 4:
BN_CONSTTIME_SWAP(3); /* Fallthrough */
case 3:
BN_CONSTTIME_SWAP(2); /* Fallthrough */
case 2:
BN_CONSTTIME_SWAP(1); /* Fallthrough */
case 1:
BN_CONSTTIME_SWAP(0);
}
#undef BN_CONSTTIME_SWAP
}

Binary file not shown.

View File

@@ -0,0 +1,316 @@
/* crypto/bn/bn_mod.c */
/*
* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
* for the OpenSSL project.
*/
/* ====================================================================
* Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include "cryptlib.h"
#include "bn_lcl.h"
#if 0 /* now just a #define */
int BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
{
return (BN_div(NULL, rem, m, d, ctx));
/* note that rem->neg == m->neg (unless the remainder is zero) */
}
#endif
int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
{
/*
* like BN_mod, but returns non-negative remainder (i.e., 0 <= r < |d|
* always holds)
*/
if (!(BN_mod(r, m, d, ctx)))
return 0;
if (!r->neg)
return 1;
/* now -|d| < r < 0, so we have to set r := r + |d| */
return (d->neg ? BN_sub : BN_add) (r, r, d);
}
int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
if (!BN_add(r, a, b))
return 0;
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_add variant that may be used if both a and b are non-negative and
* less than m
*/
int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
if (!BN_uadd(r, a, b))
return 0;
if (BN_ucmp(r, m) >= 0)
return BN_usub(r, r, m);
return 1;
}
int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
if (!BN_sub(r, a, b))
return 0;
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_sub variant that may be used if both a and b are non-negative and
* less than m
*/
int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
if (!BN_sub(r, a, b))
return 0;
if (r->neg)
return BN_add(r, r, m);
return 1;
}
/* slow but works */
int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
BIGNUM *t;
int ret = 0;
bn_check_top(a);
bn_check_top(b);
bn_check_top(m);
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) == NULL)
goto err;
if (a == b) {
if (!BN_sqr(t, a, ctx))
goto err;
} else {
if (!BN_mul(t, a, b, ctx))
goto err;
}
if (!BN_nnmod(r, t, m, ctx))
goto err;
bn_check_top(r);
ret = 1;
err:
BN_CTX_end(ctx);
return (ret);
}
int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
{
if (!BN_sqr(r, a, ctx))
return 0;
/* r->neg == 0, thus we don't need BN_nnmod */
return BN_mod(r, r, m, ctx);
}
int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
{
if (!BN_lshift1(r, a))
return 0;
bn_check_top(r);
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_lshift1 variant that may be used if a is non-negative and less than
* m
*/
int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
{
if (!BN_lshift1(r, a))
return 0;
bn_check_top(r);
if (BN_cmp(r, m) >= 0)
return BN_sub(r, r, m);
return 1;
}
int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
BN_CTX *ctx)
{
BIGNUM *abs_m = NULL;
int ret;
if (!BN_nnmod(r, a, m, ctx))
return 0;
if (m->neg) {
abs_m = BN_dup(m);
if (abs_m == NULL)
return 0;
abs_m->neg = 0;
}
ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
bn_check_top(r);
if (abs_m)
BN_free(abs_m);
return ret;
}
/*
* BN_mod_lshift variant that may be used if a is non-negative and less than
* m
*/
int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
{
if (r != a) {
if (BN_copy(r, a) == NULL)
return 0;
}
while (n > 0) {
int max_shift;
/* 0 < r < m */
max_shift = BN_num_bits(m) - BN_num_bits(r);
/* max_shift >= 0 */
if (max_shift < 0) {
BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
return 0;
}
if (max_shift > n)
max_shift = n;
if (max_shift) {
if (!BN_lshift(r, r, max_shift))
return 0;
n -= max_shift;
} else {
if (!BN_lshift1(r, r))
return 0;
--n;
}
/* BN_num_bits(r) <= BN_num_bits(m) */
if (BN_cmp(r, m) >= 0) {
if (!BN_sub(r, r, m))
return 0;
}
}
bn_check_top(r);
return 1;
}

Binary file not shown.

View File

@@ -0,0 +1,558 @@
/* crypto/bn/bn_mont.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/*
* Details about Montgomery multiplication algorithms can be found at
* http://security.ece.orst.edu/publications.html, e.g.
* http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
* sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#define MONT_WORD /* use the faster word-based algorithm */
#ifdef MONT_WORD
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
#endif
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx)
{
BIGNUM *tmp;
int ret = 0;
#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
int num = mont->N.top;
if (num > 1 && a->top == num && b->top == num) {
if (bn_wexpand(r, num) == NULL)
return (0);
if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
r->neg = a->neg ^ b->neg;
r->top = num;
bn_correct_top(r);
return (1);
}
}
#endif
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
if (tmp == NULL)
goto err;
bn_check_top(tmp);
if (a == b) {
if (!BN_sqr(tmp, a, ctx))
goto err;
} else {
if (!BN_mul(tmp, a, b, ctx))
goto err;
}
/* reduce from aRR to aR */
#ifdef MONT_WORD
if (!BN_from_montgomery_word(r, tmp, mont))
goto err;
#else
if (!BN_from_montgomery(r, tmp, mont, ctx))
goto err;
#endif
bn_check_top(r);
ret = 1;
err:
BN_CTX_end(ctx);
return (ret);
}
#ifdef MONT_WORD
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
BN_ULONG *ap, *np, *rp, n0, v, carry;
int nl, max, i;
n = &(mont->N);
nl = n->top;
if (nl == 0) {
ret->top = 0;
return (1);
}
max = (2 * nl); /* carry is stored separately */
if (bn_wexpand(r, max) == NULL)
return (0);
r->neg ^= n->neg;
np = n->d;
rp = r->d;
/* clear the top words of T */
# if 1
for (i = r->top; i < max; i++) /* memset? XXX */
rp[i] = 0;
# else
memset(&(rp[r->top]), 0, (max - r->top) * sizeof(BN_ULONG));
# endif
r->top = max;
n0 = mont->n0[0];
# ifdef BN_COUNT
fprintf(stderr, "word BN_from_montgomery_word %d * %d\n", nl, nl);
# endif
for (carry = 0, i = 0; i < nl; i++, rp++) {
# ifdef __TANDEM
{
long long t1;
long long t2;
long long t3;
t1 = rp[0] * (n0 & 0177777);
t2 = 037777600000l;
t2 = n0 & t2;
t3 = rp[0] & 0177777;
t2 = (t3 * t2) & BN_MASK2;
t1 = t1 + t2;
v = bn_mul_add_words(rp, np, nl, (BN_ULONG)t1);
}
# else
v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
# endif
v = (v + carry + rp[nl]) & BN_MASK2;
carry |= (v != rp[nl]);
carry &= (v <= rp[nl]);
rp[nl] = v;
}
if (bn_wexpand(ret, nl) == NULL)
return (0);
ret->top = nl;
ret->neg = r->neg;
rp = ret->d;
ap = &(r->d[nl]);
# define BRANCH_FREE 1
# if BRANCH_FREE
{
BN_ULONG *nrp;
size_t m;
v = bn_sub_words(rp, ap, np, nl) - carry;
/*
* if subtraction result is real, then trick unconditional memcpy
* below to perform in-place "refresh" instead of actual copy.
*/
m = (0 - (size_t)v);
nrp =
(BN_ULONG *)(((PTR_SIZE_INT) rp & ~m) | ((PTR_SIZE_INT) ap & m));
for (i = 0, nl -= 4; i < nl; i += 4) {
BN_ULONG t1, t2, t3, t4;
t1 = nrp[i + 0];
t2 = nrp[i + 1];
t3 = nrp[i + 2];
ap[i + 0] = 0;
t4 = nrp[i + 3];
ap[i + 1] = 0;
rp[i + 0] = t1;
ap[i + 2] = 0;
rp[i + 1] = t2;
ap[i + 3] = 0;
rp[i + 2] = t3;
rp[i + 3] = t4;
}
for (nl += 4; i < nl; i++)
rp[i] = nrp[i], ap[i] = 0;
}
# else
if (bn_sub_words(rp, ap, np, nl) - carry)
memcpy(rp, ap, nl * sizeof(BN_ULONG));
# endif
bn_correct_top(r);
bn_correct_top(ret);
bn_check_top(ret);
return (1);
}
#endif /* MONT_WORD */
int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
int retn = 0;
#ifdef MONT_WORD
BIGNUM *t;
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) && BN_copy(t, a))
retn = BN_from_montgomery_word(ret, t, mont);
BN_CTX_end(ctx);
#else /* !MONT_WORD */
BIGNUM *t1, *t2;
BN_CTX_start(ctx);
t1 = BN_CTX_get(ctx);
t2 = BN_CTX_get(ctx);
if (t1 == NULL || t2 == NULL)
goto err;
if (!BN_copy(t1, a))
goto err;
BN_mask_bits(t1, mont->ri);
if (!BN_mul(t2, t1, &mont->Ni, ctx))
goto err;
BN_mask_bits(t2, mont->ri);
if (!BN_mul(t1, t2, &mont->N, ctx))
goto err;
if (!BN_add(t2, a, t1))
goto err;
if (!BN_rshift(ret, t2, mont->ri))
goto err;
if (BN_ucmp(ret, &(mont->N)) >= 0) {
if (!BN_usub(ret, ret, &(mont->N)))
goto err;
}
retn = 1;
bn_check_top(ret);
err:
BN_CTX_end(ctx);
#endif /* MONT_WORD */
return (retn);
}
BN_MONT_CTX *BN_MONT_CTX_new(void)
{
BN_MONT_CTX *ret;
if ((ret = (BN_MONT_CTX *)OPENSSL_malloc(sizeof(BN_MONT_CTX))) == NULL)
return (NULL);
BN_MONT_CTX_init(ret);
ret->flags = BN_FLG_MALLOCED;
return (ret);
}
void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
{
ctx->ri = 0;
BN_init(&(ctx->RR));
BN_init(&(ctx->N));
BN_init(&(ctx->Ni));
ctx->n0[0] = ctx->n0[1] = 0;
ctx->flags = 0;
}
void BN_MONT_CTX_free(BN_MONT_CTX *mont)
{
if (mont == NULL)
return;
BN_clear_free(&(mont->RR));
BN_clear_free(&(mont->N));
BN_clear_free(&(mont->Ni));
if (mont->flags & BN_FLG_MALLOCED)
OPENSSL_free(mont);
}
int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
{
int ret = 0;
BIGNUM *Ri, *R;
if (BN_is_zero(mod))
return 0;
BN_CTX_start(ctx);
if ((Ri = BN_CTX_get(ctx)) == NULL)
goto err;
R = &(mont->RR); /* grab RR as a temp */
if (!BN_copy(&(mont->N), mod))
goto err; /* Set N */
mont->N.neg = 0;
#ifdef MONT_WORD
{
BIGNUM tmod;
BN_ULONG buf[2];
BN_init(&tmod);
tmod.d = buf;
tmod.dmax = 2;
tmod.neg = 0;
mont->ri = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
# if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
/*
* Only certain BN_BITS2<=32 platforms actually make use of n0[1],
* and we could use the #else case (with a shorter R value) for the
* others. However, currently only the assembler files do know which
* is which.
*/
BN_zero(R);
if (!(BN_set_bit(R, 2 * BN_BITS2)))
goto err;
tmod.top = 0;
if ((buf[0] = mod->d[0]))
tmod.top = 1;
if ((buf[1] = mod->top > 1 ? mod->d[1] : 0))
tmod.top = 2;
if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, 2 * BN_BITS2))
goto err; /* R*Ri */
if (!BN_is_zero(Ri)) {
if (!BN_sub_word(Ri, 1))
goto err;
} else { /* if N mod word size == 1 */
if (bn_expand(Ri, (int)sizeof(BN_ULONG) * 2) == NULL)
goto err;
/* Ri-- (mod double word size) */
Ri->neg = 0;
Ri->d[0] = BN_MASK2;
Ri->d[1] = BN_MASK2;
Ri->top = 2;
}
if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
goto err;
/*
* Ni = (R*Ri-1)/N, keep only couple of least significant words:
*/
mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
# else
BN_zero(R);
if (!(BN_set_bit(R, BN_BITS2)))
goto err; /* R */
buf[0] = mod->d[0]; /* tmod = N mod word size */
buf[1] = 0;
tmod.top = buf[0] != 0 ? 1 : 0;
/* Ri = R^-1 mod N */
if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, BN_BITS2))
goto err; /* R*Ri */
if (!BN_is_zero(Ri)) {
if (!BN_sub_word(Ri, 1))
goto err;
} else { /* if N mod word size == 1 */
if (!BN_set_word(Ri, BN_MASK2))
goto err; /* Ri-- (mod word size) */
}
if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
goto err;
/*
* Ni = (R*Ri-1)/N, keep only least significant word:
*/
mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
mont->n0[1] = 0;
# endif
}
#else /* !MONT_WORD */
{ /* bignum version */
mont->ri = BN_num_bits(&mont->N);
BN_zero(R);
if (!BN_set_bit(R, mont->ri))
goto err; /* R = 2^ri */
/* Ri = R^-1 mod N */
if ((BN_mod_inverse(Ri, R, &mont->N, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, mont->ri))
goto err; /* R*Ri */
if (!BN_sub_word(Ri, 1))
goto err;
/*
* Ni = (R*Ri-1) / N
*/
if (!BN_div(&(mont->Ni), NULL, Ri, &mont->N, ctx))
goto err;
}
#endif
/* setup RR for conversions */
BN_zero(&(mont->RR));
if (!BN_set_bit(&(mont->RR), mont->ri * 2))
goto err;
if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx))
goto err;
ret = 1;
err:
BN_CTX_end(ctx);
return ret;
}
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
{
if (to == from)
return (to);
if (!BN_copy(&(to->RR), &(from->RR)))
return NULL;
if (!BN_copy(&(to->N), &(from->N)))
return NULL;
if (!BN_copy(&(to->Ni), &(from->Ni)))
return NULL;
to->ri = from->ri;
to->n0[0] = from->n0[0];
to->n0[1] = from->n0[1];
return (to);
}
BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
const BIGNUM *mod, BN_CTX *ctx)
{
BN_MONT_CTX *ret;
CRYPTO_r_lock(lock);
ret = *pmont;
CRYPTO_r_unlock(lock);
if (ret)
return ret;
/*
* We don't want to serialise globally while doing our lazy-init math in
* BN_MONT_CTX_set. That punishes threads that are doing independent
* things. Instead, punish the case where more than one thread tries to
* lazy-init the same 'pmont', by having each do the lazy-init math work
* independently and only use the one from the thread that wins the race
* (the losers throw away the work they've done).
*/
ret = BN_MONT_CTX_new();
if (!ret)
return NULL;
if (!BN_MONT_CTX_set(ret, mod, ctx)) {
BN_MONT_CTX_free(ret);
return NULL;
}
/* The locked compare-and-set, after the local work is done. */
CRYPTO_w_lock(lock);
if (*pmont) {
BN_MONT_CTX_free(ret);
ret = *pmont;
} else
*pmont = ret;
CRYPTO_w_unlock(lock);
return ret;
}

Binary file not shown.

View File

@@ -0,0 +1,128 @@
/* crypto/bn/bn_mpi.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
int BN_bn2mpi(const BIGNUM *a, unsigned char *d)
{
int bits;
int num = 0;
int ext = 0;
long l;
bits = BN_num_bits(a);
num = (bits + 7) / 8;
if (bits > 0) {
ext = ((bits & 0x07) == 0);
}
if (d == NULL)
return (num + 4 + ext);
l = num + ext;
d[0] = (unsigned char)(l >> 24) & 0xff;
d[1] = (unsigned char)(l >> 16) & 0xff;
d[2] = (unsigned char)(l >> 8) & 0xff;
d[3] = (unsigned char)(l) & 0xff;
if (ext)
d[4] = 0;
num = BN_bn2bin(a, &(d[4 + ext]));
if (a->neg)
d[4] |= 0x80;
return (num + 4 + ext);
}
BIGNUM *BN_mpi2bn(const unsigned char *d, int n, BIGNUM *a)
{
long len;
int neg = 0;
if (n < 4) {
BNerr(BN_F_BN_MPI2BN, BN_R_INVALID_LENGTH);
return (NULL);
}
len = ((long)d[0] << 24) | ((long)d[1] << 16) | ((int)d[2] << 8) | (int)
d[3];
if ((len + 4) != n) {
BNerr(BN_F_BN_MPI2BN, BN_R_ENCODING_ERROR);
return (NULL);
}
if (a == NULL)
a = BN_new();
if (a == NULL)
return (NULL);
if (len == 0) {
a->neg = 0;
a->top = 0;
return (a);
}
d += 4;
if ((*d) & 0x80)
neg = 1;
if (BN_bin2bn(d, (int)len, a) == NULL)
return (NULL);
a->neg = neg;
if (neg) {
BN_clear_bit(a, BN_num_bits(a) - 1);
}
bn_check_top(a);
return (a);
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -0,0 +1,515 @@
/* crypto/bn/bn_prime.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#include <stdio.h>
#include <time.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
/*
* NB: these functions have been "upgraded", the deprecated versions (which
* are compatibility wrappers using these functions) are in bn_depr.c. -
* Geoff
*/
/*
* The quick sieve algorithm approach to weeding out primes is Philip
* Zimmermann's, as implemented in PGP. I have had a read of his comments
* and implemented my own version.
*/
#include "bn_prime.h"
static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
const BIGNUM *a1_odd, int k, BN_CTX *ctx,
BN_MONT_CTX *mont);
static int probable_prime(BIGNUM *rnd, int bits);
static int probable_prime_dh(BIGNUM *rnd, int bits,
const BIGNUM *add, const BIGNUM *rem,
BN_CTX *ctx);
static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
const BIGNUM *rem, BN_CTX *ctx);
int BN_GENCB_call(BN_GENCB *cb, int a, int b)
{
/* No callback means continue */
if (!cb)
return 1;
switch (cb->ver) {
case 1:
/* Deprecated-style callbacks */
if (!cb->cb.cb_1)
return 1;
cb->cb.cb_1(a, b, cb->arg);
return 1;
case 2:
/* New-style callbacks */
return cb->cb.cb_2(a, b, cb);
default:
break;
}
/* Unrecognised callback type */
return 0;
}
int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe,
const BIGNUM *add, const BIGNUM *rem, BN_GENCB *cb)
{
BIGNUM *t;
int found = 0;
int i, j, c1 = 0;
BN_CTX *ctx;
int checks = BN_prime_checks_for_size(bits);
ctx = BN_CTX_new();
if (ctx == NULL)
goto err;
BN_CTX_start(ctx);
t = BN_CTX_get(ctx);
if (!t)
goto err;
loop:
/* make a random number and set the top and bottom bits */
if (add == NULL) {
if (!probable_prime(ret, bits))
goto err;
} else {
if (safe) {
if (!probable_prime_dh_safe(ret, bits, add, rem, ctx))
goto err;
} else {
if (!probable_prime_dh(ret, bits, add, rem, ctx))
goto err;
}
}
/* if (BN_mod_word(ret,(BN_ULONG)3) == 1) goto loop; */
if (!BN_GENCB_call(cb, 0, c1++))
/* aborted */
goto err;
if (!safe) {
i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
if (i == -1)
goto err;
if (i == 0)
goto loop;
} else {
/*
* for "safe prime" generation, check that (p-1)/2 is prime. Since a
* prime is odd, We just need to divide by 2
*/
if (!BN_rshift1(t, ret))
goto err;
for (i = 0; i < checks; i++) {
j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, cb);
if (j == -1)
goto err;
if (j == 0)
goto loop;
j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, cb);
if (j == -1)
goto err;
if (j == 0)
goto loop;
if (!BN_GENCB_call(cb, 2, c1 - 1))
goto err;
/* We have a safe prime test pass */
}
}
/* we have a prime :-) */
found = 1;
err:
if (ctx != NULL) {
BN_CTX_end(ctx);
BN_CTX_free(ctx);
}
bn_check_top(ret);
return found;
}
int BN_is_prime_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
BN_GENCB *cb)
{
return BN_is_prime_fasttest_ex(a, checks, ctx_passed, 0, cb);
}
int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
int do_trial_division, BN_GENCB *cb)
{
int i, j, ret = -1;
int k;
BN_CTX *ctx = NULL;
BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
BN_MONT_CTX *mont = NULL;
const BIGNUM *A = NULL;
if (BN_cmp(a, BN_value_one()) <= 0)
return 0;
if (checks == BN_prime_checks)
checks = BN_prime_checks_for_size(BN_num_bits(a));
/* first look for small factors */
if (!BN_is_odd(a))
/* a is even => a is prime if and only if a == 2 */
return BN_is_word(a, 2);
if (do_trial_division) {
for (i = 1; i < NUMPRIMES; i++)
if (BN_mod_word(a, primes[i]) == 0)
return 0;
if (!BN_GENCB_call(cb, 1, -1))
goto err;
}
if (ctx_passed != NULL)
ctx = ctx_passed;
else if ((ctx = BN_CTX_new()) == NULL)
goto err;
BN_CTX_start(ctx);
/* A := abs(a) */
if (a->neg) {
BIGNUM *t;
if ((t = BN_CTX_get(ctx)) == NULL)
goto err;
BN_copy(t, a);
t->neg = 0;
A = t;
} else
A = a;
A1 = BN_CTX_get(ctx);
A1_odd = BN_CTX_get(ctx);
check = BN_CTX_get(ctx);
if (check == NULL)
goto err;
/* compute A1 := A - 1 */
if (!BN_copy(A1, A))
goto err;
if (!BN_sub_word(A1, 1))
goto err;
if (BN_is_zero(A1)) {
ret = 0;
goto err;
}
/* write A1 as A1_odd * 2^k */
k = 1;
while (!BN_is_bit_set(A1, k))
k++;
if (!BN_rshift(A1_odd, A1, k))
goto err;
/* Montgomery setup for computations mod A */
mont = BN_MONT_CTX_new();
if (mont == NULL)
goto err;
if (!BN_MONT_CTX_set(mont, A, ctx))
goto err;
for (i = 0; i < checks; i++) {
if (!BN_pseudo_rand_range(check, A1))
goto err;
if (!BN_add_word(check, 1))
goto err;
/* now 1 <= check < A */
j = witness(check, A, A1, A1_odd, k, ctx, mont);
if (j == -1)
goto err;
if (j) {
ret = 0;
goto err;
}
if (!BN_GENCB_call(cb, 1, i))
goto err;
}
ret = 1;
err:
if (ctx != NULL) {
BN_CTX_end(ctx);
if (ctx_passed == NULL)
BN_CTX_free(ctx);
}
if (mont != NULL)
BN_MONT_CTX_free(mont);
return (ret);
}
static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
const BIGNUM *a1_odd, int k, BN_CTX *ctx,
BN_MONT_CTX *mont)
{
if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) /* w := w^a1_odd mod a */
return -1;
if (BN_is_one(w))
return 0; /* probably prime */
if (BN_cmp(w, a1) == 0)
return 0; /* w == -1 (mod a), 'a' is probably prime */
while (--k) {
if (!BN_mod_mul(w, w, w, a, ctx)) /* w := w^2 mod a */
return -1;
if (BN_is_one(w))
return 1; /* 'a' is composite, otherwise a previous 'w'
* would have been == -1 (mod 'a') */
if (BN_cmp(w, a1) == 0)
return 0; /* w == -1 (mod a), 'a' is probably prime */
}
/*
* If we get here, 'w' is the (a-1)/2-th power of the original 'w', and
* it is neither -1 nor +1 -- so 'a' cannot be prime
*/
bn_check_top(w);
return 1;
}
static int probable_prime(BIGNUM *rnd, int bits)
{
int i;
prime_t mods[NUMPRIMES];
BN_ULONG delta, maxdelta;
again:
if (!BN_rand(rnd, bits, 1, 1))
return (0);
/* we now have a random number 'rand' to test. */
for (i = 1; i < NUMPRIMES; i++)
mods[i] = (prime_t) BN_mod_word(rnd, (BN_ULONG)primes[i]);
maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
delta = 0;
loop:for (i = 1; i < NUMPRIMES; i++) {
/*
* check that rnd is not a prime and also that gcd(rnd-1,primes) == 1
* (except for 2)
*/
if (((mods[i] + delta) % primes[i]) <= 1) {
delta += 2;
if (delta > maxdelta)
goto again;
goto loop;
}
}
if (!BN_add_word(rnd, delta))
return (0);
bn_check_top(rnd);
return (1);
}
static int probable_prime_dh(BIGNUM *rnd, int bits,
const BIGNUM *add, const BIGNUM *rem,
BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *t1;
BN_CTX_start(ctx);
if ((t1 = BN_CTX_get(ctx)) == NULL)
goto err;
if (!BN_rand(rnd, bits, 0, 1))
goto err;
/* we need ((rnd-rem) % add) == 0 */
if (!BN_mod(t1, rnd, add, ctx))
goto err;
if (!BN_sub(rnd, rnd, t1))
goto err;
if (rem == NULL) {
if (!BN_add_word(rnd, 1))
goto err;
} else {
if (!BN_add(rnd, rnd, rem))
goto err;
}
/* we now have a random number 'rand' to test. */
loop:for (i = 1; i < NUMPRIMES; i++) {
/* check that rnd is a prime */
if (BN_mod_word(rnd, (BN_ULONG)primes[i]) <= 1) {
if (!BN_add(rnd, rnd, add))
goto err;
goto loop;
}
}
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(rnd);
return (ret);
}
static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
const BIGNUM *rem, BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *t1, *qadd, *q;
bits--;
BN_CTX_start(ctx);
t1 = BN_CTX_get(ctx);
q = BN_CTX_get(ctx);
qadd = BN_CTX_get(ctx);
if (qadd == NULL)
goto err;
if (!BN_rshift1(qadd, padd))
goto err;
if (!BN_rand(q, bits, 0, 1))
goto err;
/* we need ((rnd-rem) % add) == 0 */
if (!BN_mod(t1, q, qadd, ctx))
goto err;
if (!BN_sub(q, q, t1))
goto err;
if (rem == NULL) {
if (!BN_add_word(q, 1))
goto err;
} else {
if (!BN_rshift1(t1, rem))
goto err;
if (!BN_add(q, q, t1))
goto err;
}
/* we now have a random number 'rand' to test. */
if (!BN_lshift1(p, q))
goto err;
if (!BN_add_word(p, 1))
goto err;
loop:for (i = 1; i < NUMPRIMES; i++) {
/* check that p and q are prime */
/*
* check that for p and q gcd(p-1,primes) == 1 (except for 2)
*/
if ((BN_mod_word(p, (BN_ULONG)primes[i]) == 0) ||
(BN_mod_word(q, (BN_ULONG)primes[i]) == 0)) {
if (!BN_add(p, p, padd))
goto err;
if (!BN_add(q, q, qadd))
goto err;
goto loop;
}
}
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(p);
return (ret);
}

View File

@@ -0,0 +1,326 @@
/* Auto generated by bn_prime.pl */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#ifndef EIGHT_BIT
# define NUMPRIMES 2048
typedef unsigned short prime_t;
#else
# define NUMPRIMES 54
typedef unsigned char prime_t;
#endif
static const prime_t primes[NUMPRIMES] = {
2, 3, 5, 7, 11, 13, 17, 19,
23, 29, 31, 37, 41, 43, 47, 53,
59, 61, 67, 71, 73, 79, 83, 89,
97, 101, 103, 107, 109, 113, 127, 131,
137, 139, 149, 151, 157, 163, 167, 173,
179, 181, 191, 193, 197, 199, 211, 223,
227, 229, 233, 239, 241, 251,
#ifndef EIGHT_BIT
257, 263,
269, 271, 277, 281, 283, 293, 307, 311,
313, 317, 331, 337, 347, 349, 353, 359,
367, 373, 379, 383, 389, 397, 401, 409,
419, 421, 431, 433, 439, 443, 449, 457,
461, 463, 467, 479, 487, 491, 499, 503,
509, 521, 523, 541, 547, 557, 563, 569,
571, 577, 587, 593, 599, 601, 607, 613,
617, 619, 631, 641, 643, 647, 653, 659,
661, 673, 677, 683, 691, 701, 709, 719,
727, 733, 739, 743, 751, 757, 761, 769,
773, 787, 797, 809, 811, 821, 823, 827,
829, 839, 853, 857, 859, 863, 877, 881,
883, 887, 907, 911, 919, 929, 937, 941,
947, 953, 967, 971, 977, 983, 991, 997,
1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,
1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,
1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747,
1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949,
1951, 1973, 1979, 1987, 1993, 1997, 1999, 2003,
2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203,
2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311,
2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377,
2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503,
2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579,
2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693,
2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861,
2879, 2887, 2897, 2903, 2909, 2917, 2927, 2939,
2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167,
3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301,
3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347,
3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491,
3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541,
3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671,
3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863,
3877, 3881, 3889, 3907, 3911, 3917, 3919, 3923,
3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129,
4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259,
4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337,
4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481,
4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547,
4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673,
4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909,
4919, 4931, 4933, 4937, 4943, 4951, 4957, 4967,
4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167,
5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309,
5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399,
5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507,
5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573,
5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711,
5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897,
5903, 5923, 5927, 5939, 5953, 5981, 5987, 6007,
6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211,
6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329,
6337, 6343, 6353, 6359, 6361, 6367, 6373, 6379,
6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563,
6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637,
6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779,
6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971,
6977, 6983, 6991, 6997, 7001, 7013, 7019, 7027,
7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253,
7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457,
7459, 7477, 7481, 7487, 7489, 7499, 7507, 7517,
7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621,
7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691,
7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853,
7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087,
8089, 8093, 8101, 8111, 8117, 8123, 8147, 8161,
8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231,
8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369,
8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443,
8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537,
8539, 8543, 8563, 8573, 8581, 8597, 8599, 8609,
8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731,
8737, 8741, 8747, 8753, 8761, 8779, 8783, 8803,
8807, 8819, 8821, 8831, 8837, 8839, 8849, 8861,
8863, 8867, 8887, 8893, 8923, 8929, 8933, 8941,
8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091,
9103, 9109, 9127, 9133, 9137, 9151, 9157, 9161,
9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227,
9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311,
9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433,
9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491,
9497, 9511, 9521, 9533, 9539, 9547, 9551, 9587,
9601, 9613, 9619, 9623, 9629, 9631, 9643, 9649,
9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791,
9803, 9811, 9817, 9829, 9833, 9839, 9851, 9857,
9859, 9871, 9883, 9887, 9901, 9907, 9923, 9929,
9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037,
10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099,
10103, 10111, 10133, 10139, 10141, 10151, 10159, 10163,
10169, 10177, 10181, 10193, 10211, 10223, 10243, 10247,
10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303,
10313, 10321, 10331, 10333, 10337, 10343, 10357, 10369,
10391, 10399, 10427, 10429, 10433, 10453, 10457, 10459,
10463, 10477, 10487, 10499, 10501, 10513, 10529, 10531,
10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627,
10631, 10639, 10651, 10657, 10663, 10667, 10687, 10691,
10709, 10711, 10723, 10729, 10733, 10739, 10753, 10771,
10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937,
10939, 10949, 10957, 10973, 10979, 10987, 10993, 11003,
11027, 11047, 11057, 11059, 11069, 11071, 11083, 11087,
11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161,
11171, 11173, 11177, 11197, 11213, 11239, 11243, 11251,
11257, 11261, 11273, 11279, 11287, 11299, 11311, 11317,
11321, 11329, 11351, 11353, 11369, 11383, 11393, 11399,
11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483,
11489, 11491, 11497, 11503, 11519, 11527, 11549, 11551,
11579, 11587, 11593, 11597, 11617, 11621, 11633, 11657,
11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813,
11821, 11827, 11831, 11833, 11839, 11863, 11867, 11887,
11897, 11903, 11909, 11923, 11927, 11933, 11939, 11941,
11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011,
12037, 12041, 12043, 12049, 12071, 12073, 12097, 12101,
12107, 12109, 12113, 12119, 12143, 12149, 12157, 12161,
12163, 12197, 12203, 12211, 12227, 12239, 12241, 12251,
12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323,
12329, 12343, 12347, 12373, 12377, 12379, 12391, 12401,
12409, 12413, 12421, 12433, 12437, 12451, 12457, 12473,
12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589,
12601, 12611, 12613, 12619, 12637, 12641, 12647, 12653,
12659, 12671, 12689, 12697, 12703, 12713, 12721, 12739,
12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821,
12823, 12829, 12841, 12853, 12889, 12893, 12899, 12907,
12911, 12917, 12919, 12923, 12941, 12953, 12959, 12967,
12973, 12979, 12983, 13001, 13003, 13007, 13009, 13033,
13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109,
13121, 13127, 13147, 13151, 13159, 13163, 13171, 13177,
13183, 13187, 13217, 13219, 13229, 13241, 13249, 13259,
13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421,
13441, 13451, 13457, 13463, 13469, 13477, 13487, 13499,
13513, 13523, 13537, 13553, 13567, 13577, 13591, 13597,
13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681,
13687, 13691, 13693, 13697, 13709, 13711, 13721, 13723,
13729, 13751, 13757, 13759, 13763, 13781, 13789, 13799,
13807, 13829, 13831, 13841, 13859, 13873, 13877, 13879,
13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933,
13963, 13967, 13997, 13999, 14009, 14011, 14029, 14033,
14051, 14057, 14071, 14081, 14083, 14087, 14107, 14143,
14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323,
14327, 14341, 14347, 14369, 14387, 14389, 14401, 14407,
14411, 14419, 14423, 14431, 14437, 14447, 14449, 14461,
14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549,
14551, 14557, 14561, 14563, 14591, 14593, 14621, 14627,
14629, 14633, 14639, 14653, 14657, 14669, 14683, 14699,
14713, 14717, 14723, 14731, 14737, 14741, 14747, 14753,
14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821,
14827, 14831, 14843, 14851, 14867, 14869, 14879, 14887,
14891, 14897, 14923, 14929, 14939, 14947, 14951, 14957,
14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137,
15139, 15149, 15161, 15173, 15187, 15193, 15199, 15217,
15227, 15233, 15241, 15259, 15263, 15269, 15271, 15277,
15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331,
15349, 15359, 15361, 15373, 15377, 15383, 15391, 15401,
15413, 15427, 15439, 15443, 15451, 15461, 15467, 15473,
15493, 15497, 15511, 15527, 15541, 15551, 15559, 15569,
15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643,
15647, 15649, 15661, 15667, 15671, 15679, 15683, 15727,
15731, 15733, 15737, 15739, 15749, 15761, 15767, 15773,
15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919,
15923, 15937, 15959, 15971, 15973, 15991, 16001, 16007,
16033, 16057, 16061, 16063, 16067, 16069, 16073, 16087,
16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183,
16187, 16189, 16193, 16217, 16223, 16229, 16231, 16249,
16253, 16267, 16273, 16301, 16319, 16333, 16339, 16349,
16361, 16363, 16369, 16381, 16411, 16417, 16421, 16427,
16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493,
16519, 16529, 16547, 16553, 16561, 16567, 16573, 16603,
16607, 16619, 16631, 16633, 16649, 16651, 16657, 16661,
16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843,
16871, 16879, 16883, 16889, 16901, 16903, 16921, 16927,
16931, 16937, 16943, 16963, 16979, 16981, 16987, 16993,
17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053,
17077, 17093, 17099, 17107, 17117, 17123, 17137, 17159,
17167, 17183, 17189, 17191, 17203, 17207, 17209, 17231,
17239, 17257, 17291, 17293, 17299, 17317, 17321, 17327,
17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389,
17393, 17401, 17417, 17419, 17431, 17443, 17449, 17467,
17471, 17477, 17483, 17489, 17491, 17497, 17509, 17519,
17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683,
17707, 17713, 17729, 17737, 17747, 17749, 17761, 17783,
17789, 17791, 17807, 17827, 17837, 17839, 17851, 17863,
#endif
};

Binary file not shown.

View File

@@ -0,0 +1,119 @@
#!/usr/local/bin/perl
# bn_prime.pl
$num=2048;
$num=$ARGV[0] if ($#ARGV >= 0);
push(@primes,2);
$p=1;
loop: while ($#primes < $num-1)
{
$p+=2;
$s=int(sqrt($p));
for ($i=0; defined($primes[$i]) && $primes[$i]<=$s; $i++)
{
next loop if (($p%$primes[$i]) == 0);
}
push(@primes,$p);
}
# print <<"EOF";
# /* Auto generated by bn_prime.pl */
# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
# * All rights reserved.
# * Copyright remains Eric Young's, and as such any Copyright notices in
# * the code are not to be removed.
# * See the COPYRIGHT file in the SSLeay distribution for more details.
# */
#
# EOF
print <<\EOF;
/* Auto generated by bn_prime.pl */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
EOF
for ($i=0; $i <= $#primes; $i++)
{
if ($primes[$i] > 256)
{
$eight=$i;
last;
}
}
printf "#ifndef EIGHT_BIT\n";
printf "#define NUMPRIMES %d\n",$num;
printf "typedef unsigned short prime_t;\n";
printf "#else\n";
printf "#define NUMPRIMES %d\n",$eight;
printf "typedef unsigned char prime_t;\n";
printf "#endif\n";
print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
$init=0;
for ($i=0; $i <= $#primes; $i++)
{
printf "\n#ifndef EIGHT_BIT\n\t" if ($primes[$i] > 256) && !($init++);
printf("\n\t") if (($i%8) == 0) && ($i != 0);
printf("%4d,",$primes[$i]);
}
print "\n#endif\n\t};\n";

View File

@@ -0,0 +1,388 @@
/* crypto/bn/bn_print.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include <ctype.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include "bn_lcl.h"
static const char Hex[] = "0123456789ABCDEF";
/* Must 'OPENSSL_free' the returned data */
char *BN_bn2hex(const BIGNUM *a)
{
int i, j, v, z = 0;
char *buf;
char *p;
if (a->neg && BN_is_zero(a)) {
/* "-0" == 3 bytes including NULL terminator */
buf = OPENSSL_malloc(3);
} else {
buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);
}
if (buf == NULL) {
BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);
goto err;
}
p = buf;
if (a->neg)
*(p++) = '-';
if (BN_is_zero(a))
*(p++) = '0';
for (i = a->top - 1; i >= 0; i--) {
for (j = BN_BITS2 - 8; j >= 0; j -= 8) {
/* strip leading zeros */
v = ((int)(a->d[i] >> (long)j)) & 0xff;
if (z || (v != 0)) {
*(p++) = Hex[v >> 4];
*(p++) = Hex[v & 0x0f];
z = 1;
}
}
}
*p = '\0';
err:
return (buf);
}
/* Must 'OPENSSL_free' the returned data */
char *BN_bn2dec(const BIGNUM *a)
{
int i = 0, num, ok = 0;
char *buf = NULL;
char *p;
BIGNUM *t = NULL;
BN_ULONG *bn_data = NULL, *lp;
/*-
* get an upper bound for the length of the decimal integer
* num <= (BN_num_bits(a) + 1) * log(2)
* <= 3 * BN_num_bits(a) * 0.1001 + log(2) + 1 (rounding error)
* <= BN_num_bits(a)/10 + BN_num_bits/1000 + 1 + 1
*/
i = BN_num_bits(a) * 3;
num = (i / 10 + i / 1000 + 1) + 1;
bn_data =
(BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG));
buf = (char *)OPENSSL_malloc(num + 3);
if ((buf == NULL) || (bn_data == NULL)) {
BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);
goto err;
}
if ((t = BN_dup(a)) == NULL)
goto err;
#define BUF_REMAIN (num+3 - (size_t)(p - buf))
p = buf;
lp = bn_data;
if (BN_is_zero(t)) {
*(p++) = '0';
*(p++) = '\0';
} else {
if (BN_is_negative(t))
*p++ = '-';
i = 0;
while (!BN_is_zero(t)) {
*lp = BN_div_word(t, BN_DEC_CONV);
lp++;
}
lp--;
/*
* We now have a series of blocks, BN_DEC_NUM chars in length, where
* the last one needs truncation. The blocks need to be reversed in
* order.
*/
BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT1, *lp);
while (*p)
p++;
while (lp != bn_data) {
lp--;
BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT2, *lp);
while (*p)
p++;
}
}
ok = 1;
err:
if (bn_data != NULL)
OPENSSL_free(bn_data);
if (t != NULL)
BN_free(t);
if (!ok && buf) {
OPENSSL_free(buf);
buf = NULL;
}
return (buf);
}
int BN_hex2bn(BIGNUM **bn, const char *a)
{
BIGNUM *ret = NULL;
BN_ULONG l = 0;
int neg = 0, h, m, i, j, k, c;
int num;
if ((a == NULL) || (*a == '\0'))
return (0);
if (*a == '-') {
neg = 1;
a++;
}
for (i = 0; isxdigit((unsigned char)a[i]); i++) ;
num = i + neg;
if (bn == NULL)
return (num);
/* a is the start of the hex digits, and it is 'i' long */
if (*bn == NULL) {
if ((ret = BN_new()) == NULL)
return (0);
} else {
ret = *bn;
BN_zero(ret);
}
/* i is the number of hex digests; */
if (bn_expand(ret, i * 4) == NULL)
goto err;
j = i; /* least significant 'hex' */
m = 0;
h = 0;
while (j > 0) {
m = ((BN_BYTES * 2) <= j) ? (BN_BYTES * 2) : j;
l = 0;
for (;;) {
c = a[j - m];
if ((c >= '0') && (c <= '9'))
k = c - '0';
else if ((c >= 'a') && (c <= 'f'))
k = c - 'a' + 10;
else if ((c >= 'A') && (c <= 'F'))
k = c - 'A' + 10;
else
k = 0; /* paranoia */
l = (l << 4) | k;
if (--m <= 0) {
ret->d[h++] = l;
break;
}
}
j -= (BN_BYTES * 2);
}
ret->top = h;
bn_correct_top(ret);
ret->neg = neg;
*bn = ret;
bn_check_top(ret);
return (num);
err:
if (*bn == NULL)
BN_free(ret);
return (0);
}
int BN_dec2bn(BIGNUM **bn, const char *a)
{
BIGNUM *ret = NULL;
BN_ULONG l = 0;
int neg = 0, i, j;
int num;
if ((a == NULL) || (*a == '\0'))
return (0);
if (*a == '-') {
neg = 1;
a++;
}
for (i = 0; isdigit((unsigned char)a[i]); i++) ;
num = i + neg;
if (bn == NULL)
return (num);
/*
* a is the start of the digits, and it is 'i' long. We chop it into
* BN_DEC_NUM digits at a time
*/
if (*bn == NULL) {
if ((ret = BN_new()) == NULL)
return (0);
} else {
ret = *bn;
BN_zero(ret);
}
/* i is the number of digests, a bit of an over expand; */
if (bn_expand(ret, i * 4) == NULL)
goto err;
j = BN_DEC_NUM - (i % BN_DEC_NUM);
if (j == BN_DEC_NUM)
j = 0;
l = 0;
while (*a) {
l *= 10;
l += *a - '0';
a++;
if (++j == BN_DEC_NUM) {
BN_mul_word(ret, BN_DEC_CONV);
BN_add_word(ret, l);
l = 0;
j = 0;
}
}
ret->neg = neg;
bn_correct_top(ret);
*bn = ret;
bn_check_top(ret);
return (num);
err:
if (*bn == NULL)
BN_free(ret);
return (0);
}
int BN_asc2bn(BIGNUM **bn, const char *a)
{
const char *p = a;
if (*p == '-')
p++;
if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x')) {
if (!BN_hex2bn(bn, p + 2))
return 0;
} else {
if (!BN_dec2bn(bn, p))
return 0;
}
if (*a == '-')
(*bn)->neg = 1;
return 1;
}
#ifndef OPENSSL_NO_BIO
# ifndef OPENSSL_NO_FP_API
int BN_print_fp(FILE *fp, const BIGNUM *a)
{
BIO *b;
int ret;
if ((b = BIO_new(BIO_s_file())) == NULL)
return (0);
BIO_set_fp(b, fp, BIO_NOCLOSE);
ret = BN_print(b, a);
BIO_free(b);
return (ret);
}
# endif
int BN_print(BIO *bp, const BIGNUM *a)
{
int i, j, v, z = 0;
int ret = 0;
if ((a->neg) && (BIO_write(bp, "-", 1) != 1))
goto end;
if (BN_is_zero(a) && (BIO_write(bp, "0", 1) != 1))
goto end;
for (i = a->top - 1; i >= 0; i--) {
for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
/* strip leading zeros */
v = ((int)(a->d[i] >> (long)j)) & 0x0f;
if (z || (v != 0)) {
if (BIO_write(bp, &(Hex[v]), 1) != 1)
goto end;
z = 1;
}
}
}
ret = 1;
end:
return (ret);
}
#endif
char *BN_options(void)
{
static int init = 0;
static char data[16];
if (!init) {
init++;
#ifdef BN_LLONG
BIO_snprintf(data, sizeof data, "bn(%d,%d)",
(int)sizeof(BN_ULLONG) * 8, (int)sizeof(BN_ULONG) * 8);
#else
BIO_snprintf(data, sizeof data, "bn(%d,%d)",
(int)sizeof(BN_ULONG) * 8, (int)sizeof(BN_ULONG) * 8);
#endif
}
return (data);
}

Binary file not shown.

View File

@@ -0,0 +1,295 @@
/* crypto/bn/bn_rand.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
#include <stdio.h>
#include <time.h>
#include "cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
{
unsigned char *buf = NULL;
int ret = 0, bit, bytes, mask;
time_t tim;
if (bits < 0 || (bits == 1 && top > 0)) {
BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);
return 0;
}
if (bits == 0) {
BN_zero(rnd);
return 1;
}
bytes = (bits + 7) / 8;
bit = (bits - 1) % 8;
mask = 0xff << (bit + 1);
buf = (unsigned char *)OPENSSL_malloc(bytes);
if (buf == NULL) {
BNerr(BN_F_BNRAND, ERR_R_MALLOC_FAILURE);
goto err;
}
/* make a random number and set the top and bottom bits */
time(&tim);
RAND_add(&tim, sizeof(tim), 0.0);
if (pseudorand) {
if (RAND_pseudo_bytes(buf, bytes) == -1)
goto err;
} else {
if (RAND_bytes(buf, bytes) <= 0)
goto err;
}
#if 1
if (pseudorand == 2) {
/*
* generate patterns that are more likely to trigger BN library bugs
*/
int i;
unsigned char c;
for (i = 0; i < bytes; i++) {
if (RAND_pseudo_bytes(&c, 1) < 0)
goto err;
if (c >= 128 && i > 0)
buf[i] = buf[i - 1];
else if (c < 42)
buf[i] = 0;
else if (c < 84)
buf[i] = 255;
}
}
#endif
if (top >= 0) {
if (top) {
if (bit == 0) {
buf[0] = 1;
buf[1] |= 0x80;
} else {
buf[0] |= (3 << (bit - 1));
}
} else {
buf[0] |= (1 << bit);
}
}
buf[0] &= ~mask;
if (bottom) /* set bottom bit if requested */
buf[bytes - 1] |= 1;
if (!BN_bin2bn(buf, bytes, rnd))
goto err;
ret = 1;
err:
if (buf != NULL) {
OPENSSL_cleanse(buf, bytes);
OPENSSL_free(buf);
}
bn_check_top(rnd);
return (ret);
}
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(0, rnd, bits, top, bottom);
}
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(1, rnd, bits, top, bottom);
}
#if 1
int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(2, rnd, bits, top, bottom);
}
#endif
/* random number r: 0 <= r < range */
static int bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
{
int (*bn_rand) (BIGNUM *, int, int, int) =
pseudo ? BN_pseudo_rand : BN_rand;
int n;
int count = 100;
if (range->neg || BN_is_zero(range)) {
BNerr(BN_F_BN_RAND_RANGE, BN_R_INVALID_RANGE);
return 0;
}
n = BN_num_bits(range); /* n > 0 */
/* BN_is_bit_set(range, n - 1) always holds */
if (n == 1)
BN_zero(r);
else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
/*
* range = 100..._2, so 3*range (= 11..._2) is exactly one bit longer
* than range
*/
do {
if (!bn_rand(r, n + 1, -1, 0))
return 0;
/*
* If r < 3*range, use r := r MOD range (which is either r, r -
* range, or r - 2*range). Otherwise, iterate once more. Since
* 3*range = 11..._2, each iteration succeeds with probability >=
* .75.
*/
if (BN_cmp(r, range) >= 0) {
if (!BN_sub(r, r, range))
return 0;
if (BN_cmp(r, range) >= 0)
if (!BN_sub(r, r, range))
return 0;
}
if (!--count) {
BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
return 0;
}
}
while (BN_cmp(r, range) >= 0);
} else {
do {
/* range = 11..._2 or range = 101..._2 */
if (!bn_rand(r, n, -1, 0))
return 0;
if (!--count) {
BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
return 0;
}
}
while (BN_cmp(r, range) >= 0);
}
bn_check_top(r);
return 1;
}
int BN_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(0, r, range);
}
int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(1, r, range);
}

Binary file not shown.

View File

@@ -0,0 +1,251 @@
/* crypto/bn/bn_recp.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
void BN_RECP_CTX_init(BN_RECP_CTX *recp)
{
BN_init(&(recp->N));
BN_init(&(recp->Nr));
recp->num_bits = 0;
recp->flags = 0;
}
BN_RECP_CTX *BN_RECP_CTX_new(void)
{
BN_RECP_CTX *ret;
if ((ret = (BN_RECP_CTX *)OPENSSL_malloc(sizeof(BN_RECP_CTX))) == NULL)
return (NULL);
BN_RECP_CTX_init(ret);
ret->flags = BN_FLG_MALLOCED;
return (ret);
}
void BN_RECP_CTX_free(BN_RECP_CTX *recp)
{
if (recp == NULL)
return;
BN_free(&(recp->N));
BN_free(&(recp->Nr));
if (recp->flags & BN_FLG_MALLOCED)
OPENSSL_free(recp);
}
int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx)
{
if (!BN_copy(&(recp->N), d))
return 0;
BN_zero(&(recp->Nr));
recp->num_bits = BN_num_bits(d);
recp->shift = 0;
return (1);
}
int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
BN_RECP_CTX *recp, BN_CTX *ctx)
{
int ret = 0;
BIGNUM *a;
const BIGNUM *ca;
BN_CTX_start(ctx);
if ((a = BN_CTX_get(ctx)) == NULL)
goto err;
if (y != NULL) {
if (x == y) {
if (!BN_sqr(a, x, ctx))
goto err;
} else {
if (!BN_mul(a, x, y, ctx))
goto err;
}
ca = a;
} else
ca = x; /* Just do the mod */
ret = BN_div_recp(NULL, r, ca, recp, ctx);
err:
BN_CTX_end(ctx);
bn_check_top(r);
return (ret);
}
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx)
{
int i, j, ret = 0;
BIGNUM *a, *b, *d, *r;
BN_CTX_start(ctx);
a = BN_CTX_get(ctx);
b = BN_CTX_get(ctx);
if (dv != NULL)
d = dv;
else
d = BN_CTX_get(ctx);
if (rem != NULL)
r = rem;
else
r = BN_CTX_get(ctx);
if (a == NULL || b == NULL || d == NULL || r == NULL)
goto err;
if (BN_ucmp(m, &(recp->N)) < 0) {
BN_zero(d);
if (!BN_copy(r, m)) {
BN_CTX_end(ctx);
return 0;
}
BN_CTX_end(ctx);
return (1);
}
/*
* We want the remainder Given input of ABCDEF / ab we need multiply
* ABCDEF by 3 digests of the reciprocal of ab
*/
/* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
i = BN_num_bits(m);
j = recp->num_bits << 1;
if (j > i)
i = j;
/* Nr := round(2^i / N) */
if (i != recp->shift)
recp->shift = BN_reciprocal(&(recp->Nr), &(recp->N), i, ctx);
/* BN_reciprocal could have returned -1 for an error */
if (recp->shift == -1)
goto err;
/*-
* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
* = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
* <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
* = |m/N|
*/
if (!BN_rshift(a, m, recp->num_bits))
goto err;
if (!BN_mul(b, a, &(recp->Nr), ctx))
goto err;
if (!BN_rshift(d, b, i - recp->num_bits))
goto err;
d->neg = 0;
if (!BN_mul(b, &(recp->N), d, ctx))
goto err;
if (!BN_usub(r, m, b))
goto err;
r->neg = 0;
#if 1
j = 0;
while (BN_ucmp(r, &(recp->N)) >= 0) {
if (j++ > 2) {
BNerr(BN_F_BN_DIV_RECP, BN_R_BAD_RECIPROCAL);
goto err;
}
if (!BN_usub(r, r, &(recp->N)))
goto err;
if (!BN_add_word(d, 1))
goto err;
}
#endif
r->neg = BN_is_zero(r) ? 0 : m->neg;
d->neg = m->neg ^ recp->N.neg;
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(dv);
bn_check_top(rem);
return (ret);
}
/*
* len is the expected size of the result We actually calculate with an extra
* word of precision, so we can do faster division if the remainder is not
* required.
*/
/* r := 2^len / m */
int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
{
int ret = -1;
BIGNUM *t;
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) == NULL)
goto err;
if (!BN_set_bit(t, len))
goto err;
if (!BN_div(r, NULL, t, m, ctx))
goto err;
ret = len;
err:
bn_check_top(r);
BN_CTX_end(ctx);
return (ret);
}

Binary file not shown.

View File

@@ -0,0 +1,224 @@
/* crypto/bn/bn_shift.c */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
#include <stdio.h>
#include "cryptlib.h"
#include "bn_lcl.h"
int BN_lshift1(BIGNUM *r, const BIGNUM *a)
{
register BN_ULONG *ap, *rp, t, c;
int i;
bn_check_top(r);
bn_check_top(a);
if (r != a) {
r->neg = a->neg;
if (bn_wexpand(r, a->top + 1) == NULL)
return (0);
r->top = a->top;
} else {
if (bn_wexpand(r, a->top + 1) == NULL)
return (0);
}
ap = a->d;
rp = r->d;
c = 0;
for (i = 0; i < a->top; i++) {
t = *(ap++);
*(rp++) = ((t << 1) | c) & BN_MASK2;
c = (t & BN_TBIT) ? 1 : 0;
}
if (c) {
*rp = 1;
r->top++;
}
bn_check_top(r);
return (1);
}
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap, *rp, t, c;
int i, j;
bn_check_top(r);
bn_check_top(a);
if (BN_is_zero(a)) {
BN_zero(r);
return (1);
}
i = a->top;
ap = a->d;
j = i - (ap[i - 1] == 1);
if (a != r) {
if (bn_wexpand(r, j) == NULL)
return (0);
r->neg = a->neg;
}
rp = r->d;
t = ap[--i];
c = (t & 1) ? BN_TBIT : 0;
if (t >>= 1)
rp[i] = t;
while (i > 0) {
t = ap[--i];
rp[i] = ((t >> 1) & BN_MASK2) | c;
c = (t & 1) ? BN_TBIT : 0;
}
r->top = j;
bn_check_top(r);
return (1);
}
int BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
{
int i, nw, lb, rb;
BN_ULONG *t, *f;
BN_ULONG l;
bn_check_top(r);
bn_check_top(a);
if (n < 0) {
BNerr(BN_F_BN_LSHIFT, BN_R_INVALID_SHIFT);
return 0;
}
r->neg = a->neg;
nw = n / BN_BITS2;
if (bn_wexpand(r, a->top + nw + 1) == NULL)
return (0);
lb = n % BN_BITS2;
rb = BN_BITS2 - lb;
f = a->d;
t = r->d;
t[a->top + nw] = 0;
if (lb == 0)
for (i = a->top - 1; i >= 0; i--)
t[nw + i] = f[i];
else
for (i = a->top - 1; i >= 0; i--) {
l = f[i];
t[nw + i + 1] |= (l >> rb) & BN_MASK2;
t[nw + i] = (l << lb) & BN_MASK2;
}
memset(t, 0, nw * sizeof(t[0]));
/*
* for (i=0; i<nw; i++) t[i]=0;
*/
r->top = a->top + nw + 1;
bn_correct_top(r);
bn_check_top(r);
return (1);
}
int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
{
int i, j, nw, lb, rb;
BN_ULONG *t, *f;
BN_ULONG l, tmp;
bn_check_top(r);
bn_check_top(a);
if (n < 0) {
BNerr(BN_F_BN_RSHIFT, BN_R_INVALID_SHIFT);
return 0;
}
nw = n / BN_BITS2;
rb = n % BN_BITS2;
lb = BN_BITS2 - rb;
if (nw >= a->top || a->top == 0) {
BN_zero(r);
return (1);
}
i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
if (r != a) {
r->neg = a->neg;
if (bn_wexpand(r, i) == NULL)
return (0);
} else {
if (n == 0)
return 1; /* or the copying loop will go berserk */
}
f = &(a->d[nw]);
t = r->d;
j = a->top - nw;
r->top = i;
if (rb == 0) {
for (i = j; i != 0; i--)
*(t++) = *(f++);
} else {
l = *(f++);
for (i = j - 1; i != 0; i--) {
tmp = (l >> rb) & BN_MASK2;
l = *(f++);
*(t++) = (tmp | (l << lb)) & BN_MASK2;
}
if ((l = (l >> rb) & BN_MASK2))
*(t) = l;
}
bn_check_top(r);
return (1);
}

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More