Initial Commit

2017-02-25 23:55:24 +01:00
commit 1fe2e8ab62
4868 changed files with 1487355 additions and 0 deletions
--- a/openssl-1.0.2f/crypto/bn/Makefile
+++ b/openssl-1.0.2f/crypto/bn/Makefile
@@ -0,0 +1,389 @@
+#
+# OpenSSL/crypto/bn/Makefile
+#
+
+DIR=	bn
+TOP=	../..
+CC=	cc
+CPP=    $(CC) -E
+INCLUDES= -I.. -I$(TOP) -I../../include
+CFLAG=-g
+MAKEFILE=	Makefile
+AR=		ar r
+
+BN_ASM=		bn_asm.o
+
+CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
+
+GENERAL=Makefile
+TEST=bntest.c exptest.c
+APPS=
+
+LIB=$(TOP)/libcrypto.a
+LIBSRC=	bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
+	bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
+	bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
+	bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
+	bn_depr.c bn_const.c bn_x931p.c
+
+LIBOBJ=	bn_add.o bn_div.o bn_exp.o bn_lib.o bn_ctx.o bn_mul.o bn_mod.o \
+	bn_print.o bn_rand.o bn_shift.o bn_word.o bn_blind.o \
+	bn_kron.o bn_sqrt.o bn_gcd.o bn_prime.o bn_err.o bn_sqr.o $(BN_ASM) \
+	bn_recp.o bn_mont.o bn_mpi.o bn_exp2.o bn_gf2m.o bn_nist.o \
+	bn_depr.o bn_const.o bn_x931p.o
+
+SRC= $(LIBSRC)
+
+EXHEADER= bn.h
+HEADER=	bn_lcl.h bn_prime.h $(EXHEADER)
+
+ALL=    $(GENERAL) $(SRC) $(HEADER)
+
+top:	# run the top-level Makefile's sub_all with DIRS=crypto SDIRS=$(DIR); '&&' keeps $(MAKE) from running here if cd fails
+	(cd ../.. && $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
+
+all:	lib
+
+bn_prime.h: bn_prime.pl
+	$(PERL) bn_prime.pl >bn_prime.h
+
+divtest: divtest.c ../../libcrypto.a	# links divtest.c against libcrypto.a with the configured $(CC)
+	$(CC) -I../../include divtest.c -o divtest ../../libcrypto.a
+
+bnbug: bnbug.c ../../libcrypto.a top	# built with the configured $(CC); relinks every time since the 'top' rule creates no file
+	$(CC) -g -I../../include bnbug.c -o bnbug ../../libcrypto.a
+
+lib:	$(LIBOBJ)
+	$(AR) $(LIB) $(LIBOBJ)
+	$(RANLIB) $(LIB) || echo Never mind.
+	@touch lib
+
+bn-586.s:	asm/bn-586.pl ../perlasm/x86asm.pl
+	$(PERL) asm/bn-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+co-586.s:	asm/co-586.pl ../perlasm/x86asm.pl
+	$(PERL) asm/co-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+x86-mont.s:	asm/x86-mont.pl ../perlasm/x86asm.pl
+	$(PERL) asm/x86-mont.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+x86-gf2m.s:	asm/x86-gf2m.pl ../perlasm/x86asm.pl
+	$(PERL) asm/x86-gf2m.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+
+sparcv8.o:	asm/sparcv8.S
+	$(CC) $(CFLAGS) -c asm/sparcv8.S
+bn-sparcv9.o:	asm/sparcv8plus.S
+	$(CC) $(CFLAGS) -c -o $@ asm/sparcv8plus.S
+sparcv9a-mont.s:	asm/sparcv9a-mont.pl
+	$(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
+sparcv9-mont.s:		asm/sparcv9-mont.pl
+	$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
+vis3-mont.s:		asm/vis3-mont.pl
+	$(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
+sparct4-mont.S:	asm/sparct4-mont.pl
+	$(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
+sparcv9-gf2m.S:	asm/sparcv9-gf2m.pl
+	$(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
+
+bn-mips3.o:	asm/mips3.s
+	@if [ "$(CC)" = "gcc" ]; then \
+		ABI=`expr "$(CFLAGS)" : ".*-mabi=\([n3264]*\)"` && \
+		as -$$ABI -O -o $@ asm/mips3.s; \
+	else	$(CC) -c $(CFLAGS) -o $@ asm/mips3.s; fi
+
+bn-mips.s:	asm/mips.pl
+	$(PERL) asm/mips.pl $(PERLASM_SCHEME) $@
+mips-mont.s:	asm/mips-mont.pl
+	$(PERL)	asm/mips-mont.pl $(PERLASM_SCHEME) $@
+
+bn-s390x.o:	asm/s390x.S
+	$(CC) $(CFLAGS) -c -o $@ asm/s390x.S
+s390x-gf2m.s:	asm/s390x-gf2m.pl
+	$(PERL) asm/s390x-gf2m.pl $(PERLASM_SCHEME) $@
+
+x86_64-gcc.o:	asm/x86_64-gcc.c
+	$(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
+x86_64-mont.s:	asm/x86_64-mont.pl
+	$(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
+x86_64-mont5.s:	asm/x86_64-mont5.pl
+	$(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
+x86_64-gf2m.s:	asm/x86_64-gf2m.pl
+	$(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
+rsaz-x86_64.s:	asm/rsaz-x86_64.pl
+	$(PERL) asm/rsaz-x86_64.pl $(PERLASM_SCHEME) > $@
+rsaz-avx2.s:	asm/rsaz-avx2.pl 
+	$(PERL) asm/rsaz-avx2.pl $(PERLASM_SCHEME) > $@
+
+bn-ia64.s:	asm/ia64.S
+	$(CC) $(CFLAGS) -E asm/ia64.S > $@
+ia64-mont.s:	asm/ia64-mont.pl
+	$(PERL) asm/ia64-mont.pl $@ $(CFLAGS)
+
+# GNU assembler fails to compile PA-RISC2 modules, insist on calling
+# vendor assembler...
+pa-risc2W.o: asm/pa-risc2W.s
+	/usr/ccs/bin/as -o pa-risc2W.o asm/pa-risc2W.s
+pa-risc2.o: asm/pa-risc2.s
+	/usr/ccs/bin/as -o pa-risc2.o asm/pa-risc2.s
+parisc-mont.s:	asm/parisc-mont.pl
+	$(PERL) asm/parisc-mont.pl $(PERLASM_SCHEME) $@
+
+# ppc - AIX, Linux, MacOS X...
+bn-ppc.s:	asm/ppc.pl;	$(PERL) asm/ppc.pl $(PERLASM_SCHEME) $@
+ppc-mont.s:	asm/ppc-mont.pl;$(PERL) asm/ppc-mont.pl $(PERLASM_SCHEME) $@
+ppc64-mont.s:	asm/ppc64-mont.pl;$(PERL) asm/ppc64-mont.pl $(PERLASM_SCHEME) $@
+
+alpha-mont.s:	asm/alpha-mont.pl
+	(preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
+	$(PERL) asm/alpha-mont.pl > $$preproc && \
+	$(CC) -E -P $$preproc > $@ && rm $$preproc)
+
+# GNU make "catch all"
+%-mont.S:	asm/%-mont.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
+%-gf2m.S:	asm/%-gf2m.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
+
+armv4-mont.o:	armv4-mont.S
+armv4-gf2m.o:	armv4-gf2m.S
+
+files:
+	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+
+links:
+	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
+	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
+	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
+
+install:	# copy each $(EXHEADER) file into $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl, mode 644; stop at the first failure
+	@[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
+	@headerlist="$(EXHEADER)"; for i in $$headerlist ; \
+	do  \
+	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i && \
+	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ) || exit 1; \
+	done;
+
+exptest:
+	rm -f exptest
+	gcc -I../../include -g2 -ggdb -o exptest exptest.c ../../libcrypto.a
+
+div:
+	rm -f a.out
+	gcc -I.. -g div.c ../../libcrypto.a
+
+tags:
+	ctags $(SRC)
+
+tests:
+
+lint:
+	lint -DLINT $(INCLUDES) $(SRC)>fluff
+
+update: bn_prime.h depend
+
+depend:
+	@[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
+	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
+
+dclean:
+	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
+	mv -f Makefile.new $(MAKEFILE)
+
+clean:	# remove build products, including the *.S files generated in this directory by the perlasm rules
+	rm -f *.s *.S *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
+bn_add.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_add.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_add.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_add.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_add.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_add.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_add.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_add.c bn_lcl.h
+bn_asm.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_asm.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_asm.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_asm.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_asm.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_asm.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_asm.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_asm.c bn_lcl.h
+bn_blind.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_blind.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_blind.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_blind.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_blind.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_blind.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_blind.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_blind.c bn_lcl.h
+bn_const.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+bn_const.o: ../../include/openssl/opensslconf.h
+bn_const.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_const.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_const.o: ../../include/openssl/symhacks.h bn.h bn_const.c
+bn_ctx.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_ctx.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_ctx.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_ctx.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_ctx.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_ctx.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_ctx.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_ctx.c bn_lcl.h
+bn_depr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_depr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_depr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_depr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_depr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_depr.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
+bn_depr.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+bn_depr.o: ../cryptlib.h bn_depr.c bn_lcl.h
+bn_div.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_div.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_div.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_div.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_div.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_div.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_div.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_div.c bn_lcl.h
+bn_err.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_err.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+bn_err.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
+bn_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
+bn_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
+bn_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+bn_err.o: bn_err.c
+bn_exp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_exp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
+bn_exp.o: rsaz_exp.h
+bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_exp2.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_exp2.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_exp2.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_exp2.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp2.c bn_lcl.h
+bn_gcd.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_gcd.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_gcd.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_gcd.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_gcd.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_gcd.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_gcd.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gcd.c bn_lcl.h
+bn_gf2m.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_gf2m.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_gf2m.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_gf2m.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_gf2m.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_gf2m.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_gf2m.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gf2m.c bn_lcl.h
+bn_kron.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_kron.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_kron.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_kron.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_kron.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_kron.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_kron.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_kron.c bn_lcl.h
+bn_lib.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_lib.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_lib.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_lib.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_lib.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_lib.c
+bn_mod.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_mod.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_mod.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_mod.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_mod.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_mod.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_mod.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mod.c
+bn_mont.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_mont.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_mont.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_mont.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_mont.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_mont.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_mont.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mont.c
+bn_mpi.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_mpi.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_mpi.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_mpi.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_mpi.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_mpi.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_mpi.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mpi.c
+bn_mul.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_mul.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_mul.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_mul.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_mul.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_mul.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_mul.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mul.c
+bn_nist.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_nist.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_nist.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_nist.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_nist.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_nist.c
+bn_prime.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_prime.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_prime.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_prime.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_prime.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_prime.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
+bn_prime.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+bn_prime.o: ../cryptlib.h bn_lcl.h bn_prime.c bn_prime.h
+bn_print.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_print.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_print.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_print.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_print.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_print.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_print.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_print.c
+bn_rand.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_rand.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_rand.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_rand.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_rand.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_rand.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
+bn_rand.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
+bn_rand.o: ../cryptlib.h bn_lcl.h bn_rand.c
+bn_recp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_recp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_recp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_recp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_recp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_recp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_recp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_recp.c
+bn_shift.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_shift.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_shift.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_shift.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_shift.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_shift.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_shift.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_shift.c
+bn_sqr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_sqr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_sqr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_sqr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_sqr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_sqr.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_sqr.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqr.c
+bn_sqrt.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_sqrt.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_sqrt.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_sqrt.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_sqrt.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_sqrt.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_sqrt.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqrt.c
+bn_word.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
+bn_word.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
+bn_word.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
+bn_word.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
+bn_word.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_word.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_word.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_word.c
+bn_x931p.o: ../../include/openssl/bn.h ../../include/openssl/crypto.h
+bn_x931p.o: ../../include/openssl/e_os2.h ../../include/openssl/opensslconf.h
+bn_x931p.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+bn_x931p.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+bn_x931p.o: ../../include/openssl/symhacks.h bn_x931p.c
--- a/openssl-1.0.2f/crypto/bn/asm/README
+++ b/openssl-1.0.2f/crypto/bn/asm/README
@@ -0,0 +1,27 @@
+<OBSOLETE>
+
+All assembler files in this directory are just versions of the file
+crypto/bn/bn_asm.c.
+
+Quite a few of these files are just the assembler output from gcc, since on
+quite a few machines it is 2 times faster than the system compiler's output.
+
+For the x86, I have hand-written assembler because of the bad job all
+compilers seem to do on it.  This normally gives a 2 times speedup in the RSA
+routines.
+
+For the DEC alpha, I also hand wrote the assembler (except the division which
+is just the output from the C compiler pasted on the end of the file).
+On the 2 alpha C compilers I had access to, it was not possible to do
+64b x 64b -> 128b calculations (both long and the long long data types
+were 64 bits).  So the hand assembler gives access to the 128 bit result and
+a 2 times speedup :-).
+
+There are 3 versions of assembler for the HP PA-RISC.
+
+pa-risc.s is the original one, which works fine and was generated using gcc :-)
+
+pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations
+by Chris Ruemmler from HP (with some help from the HP C compiler).
+
+</OBSOLETE>
--- a/openssl-1.0.2f/crypto/bn/asm/alpha-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,321 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+
+# int bn_mul_mont(
+$rp="a0";	# BN_ULONG *rp,
+$ap="a1";	# const BN_ULONG *ap,
+$bp="a2";	# const BN_ULONG *bp,
+$np="a3";	# const BN_ULONG *np,
+$n0="a4";	# const BN_ULONG *n0,
+$num="a5";	# int num);
+
+$lo0="t0";	# low word of the ap[j]*bp[i] accumulation
+$hi0="t1";	# ap[0] on load, then high word/carry of that accumulation
+$lo1="t2";	# low word of the np[j]*m1 accumulation
+$hi1="t3";	# np[0] on load, then high word/carry of that accumulation
+$aj="t4";	# &ap[j], then the loaded ap[j]
+$bi="t5";	# &bp[i], then the loaded bp[i]
+$nj="t6";	# &np[j], then the loaded np[j]
+$tp="t7";	# cursor into the temporary vector carved out of the stack
+$alo="t8";	# mulq result of ap[j]*bp[i] (low half)
+$ahi="t9";	# umulh result of ap[j]*bp[i] (high half)
+$nlo="t10";	# mulq result of np[j]*m1 (low half)
+$nhi="t11";	# umulh result of np[j]*m1 (high half)
+$tj="t12";	# tp[j]; also borrowed as loop-condition flag and as &tp[num]
+$i="s3";	# outer loop counter (index into bp); s3 is saved/restored by the prologue/epilogue
+$j="s4";	# inner loop counter (index into ap/np); reused as scratch in the outer-loop tail
+$m1="s5";	# per-iteration multiplier applied to np[], computed as lo0*n0
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+
+.globl	bn_mul_mont
+.align	5
+.ent	bn_mul_mont
+bn_mul_mont:
+	lda	sp,-48(sp)
+	stq	ra,0(sp)
+	stq	s3,8(sp)
+	stq	s4,16(sp)
+	stq	s5,24(sp)
+	stq	fp,32(sp)
+	mov	sp,fp
+	.mask	0x0400f000,-48
+	.frame	fp,48,ra
+	.prologue 0
+
+	.align	4
+	.set	reorder
+	sextl	$num,$num
+	mov	0,v0
+	cmplt	$num,4,AT
+	bne	AT,.Lexit
+
+	ldq	$hi0,0($ap)	# ap[0]
+	s8addq	$num,16,AT
+	ldq	$aj,8($ap)
+	subq	sp,AT,sp
+	ldq	$bi,0($bp)	# bp[0]
+	lda	AT,-4096(zero)	# mov	-4096,AT
+	ldq	$n0,0($n0)
+	and	sp,AT,sp
+
+	mulq	$hi0,$bi,$lo0
+	ldq	$hi1,0($np)	# np[0]
+	umulh	$hi0,$bi,$hi0
+	ldq	$nj,8($np)
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	2,$j
+	umulh	$aj,$bi,$ahi
+	mov	sp,$tp
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+	s8addq	$j,$np,$nj
+.align	4
+.L1st:
+	.set	noreorder
+	ldq	$aj,0($aj)
+	addl	$j,1,$j
+	ldq	$nj,0($nj)
+	lda	$tp,8($tp)
+
+	addq	$alo,$hi0,$lo0
+	mulq	$aj,$bi,$alo
+	cmpult	$lo0,$hi0,AT
+	addq	$nlo,$hi1,$lo1
+
+	mulq	$nj,$m1,$nlo
+	addq	$ahi,AT,$hi0
+	cmpult	$lo1,$hi1,v0
+	cmplt	$j,$num,$tj
+
+	umulh	$aj,$bi,$ahi
+	addq	$nhi,v0,$hi1
+	addq	$lo1,$lo0,$lo1
+	s8addq	$j,$ap,$aj
+
+	umulh	$nj,$m1,$nhi
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+	s8addq	$j,$np,$nj
+
+	stq	$lo1,-8($tp)
+	nop
+	unop
+	bne	$tj,.L1st
+	.set	reorder
+
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	stq	$lo1,0($tp)
+
+	addq	$hi1,$hi0,$hi1
+	cmpult	$hi1,$hi0,AT
+	stq	$hi1,8($tp)
+	stq	AT,16($tp)
+
+	mov	1,$i
+.align	4
+.Louter:
+	s8addq	$i,$bp,$bi
+	ldq	$hi0,0($ap)
+	ldq	$aj,8($ap)
+	ldq	$bi,0($bi)
+	ldq	$hi1,0($np)
+	ldq	$nj,8($np)
+	ldq	$tj,0(sp)
+
+	mulq	$hi0,$bi,$lo0
+	umulh	$hi0,$bi,$hi0
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	mov	2,$j
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	sp,$tp
+	umulh	$aj,$bi,$ahi
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+.align	4
+.Linner:
+	.set	noreorder
+	ldq	$tj,8($tp)	#L0
+	nop			#U1
+	ldq	$aj,0($aj)	#L1
+	s8addq	$j,$np,$nj	#U0
+
+	ldq	$nj,0($nj)	#L0
+	nop			#U1
+	addq	$alo,$hi0,$lo0	#L1
+	lda	$tp,8($tp)
+
+	mulq	$aj,$bi,$alo	#U1
+	cmpult	$lo0,$hi0,AT	#L0
+	addq	$nlo,$hi1,$lo1	#L1
+	addl	$j,1,$j
+
+	mulq	$nj,$m1,$nlo	#U1
+	addq	$ahi,AT,$hi0	#L0
+	addq	$lo0,$tj,$lo0	#L1
+	cmpult	$lo1,$hi1,v0	#U0
+
+	umulh	$aj,$bi,$ahi	#U1
+	cmpult	$lo0,$tj,AT	#L0
+	addq	$lo1,$lo0,$lo1	#L1
+	addq	$nhi,v0,$hi1	#U0
+
+	umulh	$nj,$m1,$nhi	#U1
+	s8addq	$j,$ap,$aj	#L0
+	cmpult	$lo1,$lo0,v0	#L1
+	cmplt	$j,$num,$tj	#U0	# borrow $tj
+
+	addq	$hi0,AT,$hi0	#L0
+	addq	$hi1,v0,$hi1	#U1
+	stq	$lo1,-8($tp)	#L1
+	bne	$tj,.Linner	#U0
+	.set	reorder
+
+	ldq	$tj,8($tp)
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	ldq	$tj,16($tp)
+	addq	$lo1,$lo0,$j
+	cmpult	$j,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	addq	$hi1,$hi0,$lo1
+	stq	$j,0($tp)
+	cmpult	$lo1,$hi0,$hi1
+	addq	$lo1,$tj,$lo1
+	cmpult	$lo1,$tj,AT
+	addl	$i,1,$i
+	addq	$hi1,AT,$hi1
+	stq	$lo1,8($tp)
+	cmplt	$i,$num,$tj	# borrow $tj
+	stq	$hi1,16($tp)
+	bne	$tj,.Louter
+
+	s8addq	$num,sp,$tj	# &tp[num]
+	mov	$rp,$bp		# put rp aside
+	mov	sp,$tp
+	mov	sp,$ap
+	mov	0,$hi0		# clear borrow bit
+
+.align	4
+.Lsub:	ldq	$lo0,0($tp)
+	ldq	$lo1,0($np)
+	lda	$tp,8($tp)
+	lda	$np,8($np)
+	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
+	cmpult	$lo0,$lo1,AT
+	subq	$lo1,$hi0,$lo0
+	cmpult	$lo1,$lo0,$hi0
+	or	$hi0,AT,$hi0
+	stq	$lo0,0($rp)
+	cmpult	$tp,$tj,v0
+	lda	$rp,8($rp)
+	bne	v0,.Lsub
+
+	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
+	mov	sp,$tp
+	mov	$bp,$rp		# restore rp
+
+	and	sp,$hi0,$ap
+	bic	$bp,$hi0,$bp
+	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
+	lda	$tp,8($tp)
+	lda	$rp,8($rp)
+	lda	$ap,8($ap)
+	stq	zero,-8($tp)	# zap tp
+	cmpult	$tp,$tj,AT
+	stq	$aj,-8($rp)
+	bne	AT,.Lcopy
+	mov	1,v0
+
+.Lexit:
+	.set	noreorder
+	mov	fp,sp
+	/*ldq	ra,0(sp)*/
+	ldq	s3,8(sp)
+	ldq	s4,16(sp)
+	ldq	s5,24(sp)
+	ldq	fp,32(sp)
+	lda	sp,48(sp)
+	ret	(ra)
+.end	bn_mul_mont
+.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+print $code;	# emit the generated assembly on STDOUT (the Makefile redirects it to a file)
+close STDOUT or die "error closing STDOUT: $!";	# a failed flush must fail the build, not leave truncated output
--- a/openssl-1.0.2f/crypto/bn/asm/armv4-gf2m.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,289 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even fewer cycles, ~30, the improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+#
+# April 2014
+#
+# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
+# referred below, which improves ECDH and ECDSA verify benchmarks
+# by 18-40%.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+
+# Consume command-line arguments until one looks like a plain file name
+# of the form "name.ext"; anything before it is discarded.  $output is
+# left false when no such argument is present.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+# Redirect STDOUT only when an output file was actually named, and stop
+# with a diagnostic if it cannot be created instead of carrying on with
+# the result of open() ignored.  With no file name the inherited STDOUT
+# is used unchanged.
+if ($output) {
+	open(STDOUT,">$output") || die "can't open $output: $!";
+}
+
+# Start of the generated file: include arm_arch.h (presumably the source
+# of the __ARM_ARCH__/__ARM_MAX_ARCH__ macros tested further down -- its
+# contents are not visible from here) and select the text section in
+# 32-bit ARM mode.
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+___
+################
+# private interface to mul_1x1_ialu
+#
+# Inputs:   $a (r1) and $b (r0) are the two 32-bit operands; $mask (r12)
+#           must already hold 7<<2; sp must point at 32 bytes of scratch
+#           that the routine fills as the eight-entry table tab[].
+# Outputs:  $lo (r5) and $hi (r4), the low and high words of the 1x1
+#           polynomial product.
+# Clobbers: r4-r9.  Returns with "mov pc,lr".
+#
+# tab[] holds every XOR-combination of a1, a1<<1 and a1<<2, where
+# a1 = a & 0x3fffffff.  $b is then consumed three bits at a time: each
+# "and ...,$mask,$b,ls?#n" yields a byte offset into tab[], and the
+# looked-up word is XORed into $hi:$lo at the matching bit position.
+# Bits 30 and 31 of $a, which were masked out of a1, are folded in
+# separately by the two tst/eorne groups near the end.
+$a="r1";
+$b="r0";
+
+# Two sets of names for the same six registers r4..r9: the $aN names are
+# used while tab[] is being built, the $hi/$lo/$t*/$i* names while it is
+# being consumed.  map() produces a seventh value (r12) that neither list
+# takes; r12 is bound to $mask on the next statement.
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type	mul_1x1_ialu,%function
+.align	5
+mul_1x1_ialu:
+	mov	$a0,#0
+	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
+	str	$a0,[sp,#0]		@ tab[0]=0
+	add	$a2,$a1,$a1		@ a2=a1<<1
+	str	$a1,[sp,#4]		@ tab[1]=a1
+	eor	$a12,$a1,$a2		@ a1^a2
+	str	$a2,[sp,#8]		@ tab[2]=a2
+	mov	$a4,$a1,lsl#2		@ a4=a1<<2
+	str	$a12,[sp,#12]		@ tab[3]=a1^a2
+	eor	$a14,$a1,$a4		@ a1^a4
+	str	$a4,[sp,#16]		@ tab[4]=a4
+	eor	$a0,$a2,$a4		@ a2^a4
+	str	$a14,[sp,#20]		@ tab[5]=a1^a4
+	eor	$a12,$a12,$a4		@ a1^a2^a4
+	str	$a0,[sp,#24]		@ tab[6]=a2^a4
+	and	$i0,$mask,$b,lsl#2
+	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4
+
+	and	$i1,$mask,$b,lsr#1
+	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
+	and	$i0,$mask,$b,lsr#4
+	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
+	and	$i1,$mask,$b,lsr#7
+	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
+	eor	$lo,$lo,$t1,lsl#3	@ stall
+	mov	$hi,$t1,lsr#29
+	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]
+
+	and	$i0,$mask,$b,lsr#10
+	eor	$lo,$lo,$t0,lsl#6
+	eor	$hi,$hi,$t0,lsr#26
+	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]
+
+	and	$i1,$mask,$b,lsr#13
+	eor	$lo,$lo,$t1,lsl#9
+	eor	$hi,$hi,$t1,lsr#23
+	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]
+
+	and	$i0,$mask,$b,lsr#16
+	eor	$lo,$lo,$t0,lsl#12
+	eor	$hi,$hi,$t0,lsr#20
+	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]
+
+	and	$i1,$mask,$b,lsr#19
+	eor	$lo,$lo,$t1,lsl#15
+	eor	$hi,$hi,$t1,lsr#17
+	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]
+
+	and	$i0,$mask,$b,lsr#22
+	eor	$lo,$lo,$t0,lsl#18
+	eor	$hi,$hi,$t0,lsr#14
+	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]
+
+	and	$i1,$mask,$b,lsr#25
+	eor	$lo,$lo,$t1,lsl#21
+	eor	$hi,$hi,$t1,lsr#11
+	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]
+
+	tst	$a,#1<<30
+	and	$i0,$mask,$b,lsr#28
+	eor	$lo,$lo,$t0,lsl#24
+	eor	$hi,$hi,$t0,lsr#8
+	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]
+
+	eorne	$lo,$lo,$b,lsl#30
+	eorne	$hi,$hi,$b,lsr#2
+	tst	$a,#1<<31
+	eor	$lo,$lo,$t1,lsl#27
+	eor	$hi,$hi,$t1,lsr#5
+	eorne	$lo,$lo,$b,lsl#31
+	eorne	$hi,$hi,$b,lsr#1
+	eor	$lo,$lo,$t0,lsl#30
+	eor	$hi,$hi,$t0,lsr#2
+
+	mov	pc,lr
+.size	mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void	bn_GF2m_mul_2x2(BN_ULONG *r,
+#	BN_ULONG a1,BN_ULONG a0,
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
+#
+# Public entry point.  On entry r0=r, r1=a1, r2=a0, r3=b1 and b0 is the
+# first stack argument.
+#
+# When assembled with __ARM_MAX_ARCH__>=7 the routine first loads
+# OPENSSL_armcap_P PC-relatively (via the .LOPENSSL_armcap offset word)
+# and branches to .LNEON if bit 0 is set.  Otherwise the integer path
+# below makes three mul_1x1_ialu calls -- a1·b1, a0·b0 and
+# (a1+a0)·(b1+b0) -- and recombines them Karatsuba-style with XORs.
+{
+$code.=<<___;
+.global	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,%function
+.align	5
+bn_GF2m_mul_2x2:
+#if __ARM_MAX_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+.Lpic:	ldr	r12,[pc,r12]
+	tst	r12,#1
+	bne	.LNEON
+#endif
+___
+# mul_1x1_ialu takes $b in r0, so the result pointer is parked in r10
+# for the whole integer path.  b0 is fetched from [sp,#32] because eight
+# registers (r4-r10,lr) have just been pushed; 32 more bytes are then
+# reserved below sp as the tab[8] scratch area mul_1x1_ialu writes to.
+# The operand pairs are exchanged with XOR swaps between the calls.
+$ret="r10";	# reassigned 1st argument
+$code.=<<___;
+	stmdb	sp!,{r4-r10,lr}
+	mov	$ret,r0			@ reassign 1st argument
+	mov	$b,r3			@ $b=b1
+	ldr	r3,[sp,#32]		@ load b0
+	mov	$mask,#7<<2
+	sub	sp,sp,#32		@ allocate tab[8]
+
+	bl	mul_1x1_ialu		@ a1·b1
+	str	$lo,[$ret,#8]
+	str	$hi,[$ret,#12]
+
+	eor	$b,$b,r3		@ flip b0 and b1
+	 eor	$a,$a,r2		@ flip a0 and a1
+	eor	r3,r3,$b
+	 eor	r2,r2,$a
+	eor	$b,$b,r3
+	 eor	$a,$a,r2
+	bl	mul_1x1_ialu		@ a0·b0
+	str	$lo,[$ret]
+	str	$hi,[$ret,#4]
+
+	eor	$a,$a,r2
+	eor	$b,$b,r3
+	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
+___
+# r6..r9 are reloaded with r[0..3] (the two products stored above) for
+# the final recombination.  They are the same registers as
+# $t0,$t1,$i0,$i1, which are no longer needed after the last call; the
+# middle product is still live in $hi:$lo (r4:r5).
+@r=map("r$_",(6..9));
+$code.=<<___;
+	ldmia	$ret,{@r[0]-@r[3]}
+	eor	$lo,$lo,$hi
+	eor	$hi,$hi,@r[1]
+	eor	$lo,$lo,@r[0]
+	eor	$hi,$hi,@r[2]
+	eor	$lo,$lo,@r[3]
+	eor	$hi,$hi,@r[3]
+	str	$hi,[$ret,#8]
+	eor	$lo,$lo,$hi
+	add	sp,sp,#32		@ destroy tab[8]
+	str	$lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r10,pc}
+#else
+	ldmia	sp!,{r4-r10,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+___
+}
+{
+# NEON register assignment.  Each map() below yields more names than
+# there are variables on the left; the surplus ones are discarded, so
+# $r=q0, $t0..$t2=q1..q3, $t3=q8 and $a,$b,$k48,$k32,$k16=d26..d30.
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
+
+# NEON path of bn_GF2m_mul_2x2, entered through the .LNEON label and
+# only assembled for __ARM_MAX_ARCH__>=7.  Nothing has been pushed on
+# this path, so the 5th argument (b0) is read straight from [sp].
+# (a1:a0) and (b1:b0) are packed into $a and $b, the 64x64-bit
+# polynomial product is assembled from vmull.p8 partial products as
+# labelled in the inline comments, and the 128-bit result in $r is
+# stored to [r0].
+#
+# "qN#lo" / "qN#hi" is notation private to this script for the two
+# 64-bit halves of a quad register; the output loop at the end of the
+# file rewrites it to d(2N) / d(2N+1), and rewrites "ret" to "bx lr".
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.align	5
+.LNEON:
+	ldr		r12, [sp]		@ 5th argument
+	vmov.32		$a, r2, r1
+	vmov.32		$b, r12, r3
+	vmov.i64	$k48, #0x0000ffffffffffff
+	vmov.i64	$k32, #0x00000000ffffffff
+	vmov.i64	$k16, #0x000000000000ffff
+
+	vext.8		$t0#lo, $a, $a, #1	@ A1
+	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
+	vext.8		$r#lo, $b, $b, #1	@ B1
+	vmull.p8	$r, $a, $r#lo		@ E = A*B1
+	vext.8		$t1#lo, $a, $a, #2	@ A2
+	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
+	vext.8		$t3#lo, $b, $b, #2	@ B2
+	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
+	vext.8		$t2#lo, $a, $a, #3	@ A3
+	veor		$t0, $t0, $r		@ L = E + F
+	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
+	vext.8		$r#lo, $b, $b, #3	@ B3
+	veor		$t1, $t1, $t3		@ M = G + H
+	vmull.p8	$r, $a, $r#lo		@ I = A*B3
+	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
+	vand		$t0#hi, $t0#hi, $k48
+	vext.8		$t3#lo, $b, $b, #4	@ B4
+	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
+	vand		$t1#hi, $t1#hi, $k32
+	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
+	veor		$t2, $t2, $r		@ N = I + J
+	veor		$t0#lo, $t0#lo, $t0#hi
+	veor		$t1#lo, $t1#lo, $t1#hi
+	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
+	vand		$t2#hi, $t2#hi, $k16
+	vext.8		$t0, $t0, $t0, #15
+	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	$t3#hi, #0
+	vext.8		$t1, $t1, $t1, #14
+	veor		$t2#lo, $t2#lo, $t2#hi
+	vmull.p8	$r, $a, $b		@ D = A*B
+	vext.8		$t3, $t3, $t3, #12
+	vext.8		$t2, $t2, $t2, #13
+	veor		$t0, $t0, $t1
+	veor		$t2, $t2, $t3
+	veor		$r, $r, $t0
+	veor		$r, $r, $t2
+
+	vst1.32		{$r}, [r0]
+	ret		@ bx lr
+#endif
+___
+}
+# Trailer of the generated file:
+#  - close the bn_GF2m_mul_2x2 symbol (its size covers both code paths);
+#  - for __ARM_MAX_ARCH__>=7, emit .LOPENSSL_armcap, the distance from
+#    .Lpic+8 to OPENSSL_armcap_P, which the entry code adds to pc at .Lpic
+#    (the +8 matches what pc reads as at that instruction in ARM mode);
+#  - an identification string;
+#  - for __ARM_MAX_ARCH__>=7, a common-symbol definition of
+#    OPENSSL_armcap_P (4 bytes, 4-byte aligned).
+$code.=<<___;
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_MAX_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	5
+
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+# Post-process the accumulated text one line at a time and print it.
+#  1. Any `...` expression embedded in the line is evaluated as Perl and
+#     replaced by its value.
+#  2. At most ONE of the following rewrites is then applied (they are
+#     chained with "or", so the first that matches wins):
+#       qN#lo / qN#hi  ->  d(2N) / d(2N+1)
+#       ret            ->  bx lr
+#       bx lr          ->  .word 0xe12fff1e
+#     Because of the chaining, a "bx lr" produced from "ret" is left as
+#     an instruction, while a literal "bx lr" becomes a raw opcode word.
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+# close() flushes the buffered output; check its result so that a write
+# error (e.g. a full disk) aborts the build instead of silently leaving
+# a truncated assembly file behind.
+close STDOUT or die "error closing STDOUT: $!";   # enforce flush
--- a/openssl-1.0.2f/crypto/bn/asm/armv4-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,676 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an 
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 improvement was measured to vary from ~70% to
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because original integer-only code seems to perform
+# suboptimally on S4. The situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually a couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because the gain on
+# others outweighs the marginal loss on Cortex-A9.
+
+# Consume command-line arguments until one looks like a plain file name
+# of the form "name.ext"; anything before it is discarded.  $output is
+# left false when no such argument is present.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+# Redirect STDOUT only when an output file was actually named, and stop
+# with a diagnostic if it cannot be created instead of carrying on with
+# the result of open() ignored.  With no file name the inherited STDOUT
+# is used unchanged.
+if ($output) {
+	open(STDOUT,">$output") || die "can't open $output: $!";
+}
+
+# Register allocation for the integer bn_mul_mont generated below.
+# $bp, $bi and $rp are three names for r2, used at different stages of
+# the routine.
+$num="r0";	# starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10";	# sl, gcc uses it to keep @GOT
+$ahi="r11";	# fp
+$nlo="r12";	# ip
+###########	# r13 is stack pointer
+$nhi="r14";	# lr
+###########	# r15 is program counter
+
+# The strings below are spliced into load/store operands as "[$_rp]"
+# etc.  The word offsets follow from the frame bn_mul_mont builds above
+# &tp[num-1]: tp[num] (1 word), ten saved registers r4-r12,lr, then the
+# r0/r2 pair pushed at entry (slots 12 and 13), then the caller's two
+# stack arguments (slots 14 and 15).  $_bpend is the same slot as $_num:
+# the generated code overwrites it with the end-of-bp pointer.
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4";	$_bpend=$_num;
+
+# Integer-only bn_mul_mont, preceded by the file prologue.
+#
+# On entry r0=rp, r1=ap, r2=bp, r3=np; two further arguments are on the
+# stack: a pointer to n0 at [sp] (it is dereferenced below) and num at
+# [sp,#4].  r0 and r2 are pushed immediately so that {rp,bp,&n0,num}
+# form one contiguous block, addressed later through the
+# $_rp/$_bp/$_n0/$_num operands.
+#
+# Flow of the generated code:
+#  * with __ARM_MAX_ARCH__>=7: if num is a multiple of 8 and bit 0 of
+#    OPENSSL_armcap_P is set, undo the push and branch to
+#    bn_mul8x_mont_neon;
+#  * if num<2: drop the pushed pair and return 0;
+#  * otherwise save r4-r12,lr, reserve 4*num+4 bytes of stack for tp[],
+#    run the .L1st pass for bp[0] and the .Louter/.Linner passes for the
+#    remaining words of bp, subtract np from tp into rp in .Lsub, then in
+#    .Lcopy pick either tp or the difference already in rp by means of a
+#    mask (overwriting tp[] as it goes), and return 1.
+#
+# "ret" and "bx lr" are rewritten by the substitutions applied to $code
+# at the end of this script.
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+#if __ARM_MAX_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-bn_mul_mont
+#endif
+
+.global	bn_mul_mont
+.type	bn_mul_mont,%function
+
+.align	5
+bn_mul_mont:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+	tst	ip,#7
+	bne	.Lialu
+	adr	r0,bn_mul_mont
+	ldr	r2,.LOPENSSL_armcap
+	ldr	r0,[r0,r2]
+	tst	r0,#1			@ NEON available?
+	ldmia	sp, {r0,r2}
+	beq	.Lialu
+	add	sp,sp,#8
+	b	bn_mul8x_mont_neon
+.align	4
+.Lialu:
+#endif
+	cmp	ip,#2
+	mov	$num,ip			@ load num
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
+
+	mov	$num,$num,lsl#2		@ rescale $num for byte count
+	sub	sp,sp,$num		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	$num,$num,#4		@ "num=num-1"
+	add	$tp,$bp,$num		@ &bp[num-1]
+
+	add	$num,sp,$num		@ $num to point at &tp[num-1]
+	ldr	$n0,[$_n0]		@ &n0
+	ldr	$bi,[$bp]		@ bp[0]
+	ldr	$aj,[$ap],#4		@ ap[0],ap++
+	ldr	$nj,[$np],#4		@ np[0],np++
+	ldr	$n0,[$n0]		@ *n0
+	str	$tp,[$_bpend]		@ save &bp[num]
+
+	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
+	str	$n0,[$_n0]		@ save n0 value
+	mul	$n0,$alo,$n0		@ "tp[0]"*n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
+	mov	$tp,sp
+
+.L1st:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	mov	$alo,$ahi
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.L1st
+
+	adds	$nlo,$nlo,$ahi
+	ldr	$tp,[$_bp]		@ restore bp
+	mov	$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+.Louter:
+	sub	$tj,$num,sp		@ "original" $num-1 value
+	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
+	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
+	ldr	$aj,[$ap,#-4]		@ ap[0]
+	ldr	$alo,[sp]		@ tp[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
+	ldr	$tj,[sp,#4]		@ tp[1]
+
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
+	str	$tp,[$_bp]		@ save bp
+	mul	$n0,$alo,$n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
+	mov	$tp,sp
+
+.Linner:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	adds	$alo,$ahi,$tj		@ +=tp[j]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adc	$ahi,$ahi,#0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.Linner
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	adc	$nhi,$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
+	adds	$nlo,$nlo,$tj
+	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+	cmp	$tp,$tj
+	bne	.Louter
+
+	ldr	$rp,[$_rp]		@ pull rp
+	add	$num,$num,#4		@ $num to point at &tp[num]
+	sub	$aj,$num,sp		@ "original" num value
+	mov	$tp,sp			@ "rewind" $tp
+	mov	$ap,$tp			@ "borrow" $ap
+	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
+
+	subs	$tj,$tj,$tj		@ "clear" carry flag
+.Lsub:	ldr	$tj,[$tp],#4
+	ldr	$nj,[$np],#4
+	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
+	str	$tj,[$rp],#4		@ rp[j]=
+	teq	$tp,$num		@ preserve carry
+	bne	.Lsub
+	sbcs	$nhi,$nhi,#0		@ upmost carry
+	mov	$tp,sp			@ "rewind" $tp
+	sub	$rp,$rp,$aj		@ "rewind" $rp
+
+	and	$ap,$tp,$nhi
+	bic	$np,$rp,$nhi
+	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
+
+.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+	str	sp,[$tp],#4		@ zap tp
+	str	$tj,[$rp],#4
+	cmp	$tp,$num
+	bne	.Lcopy
+
+	add	sp,$num,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4-r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+	ret				@ bx lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_mul_mont,.-bn_mul_mont
+___
+{
+# Dlo/Dhi translate a quad-register name "qN" into the name of its low
+# or high 64-bit half, "d(2N)" or "d(2N+1)".  They return "" when the
+# argument contains no q-register name.  They are invoked from the
+# `...` expressions embedded in the heredoc below, which are evaluated
+# when $code is post-processed at the end of the script.
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+# NEON register assignment:
+#   $A0-$A3  d0-d3    eight words of ap
+#   $N0-$N3  d4-d7    eight words of np
+#   $Z       q4       zero source; $zero is its low half (d8)
+#   $Temp    q5       scratch; $temp is its low half (d10)
+#   $A0xB-$A7xB q6-q13  the eight 64-bit-lane accumulators
+#   $Bi,$Ni,$M0 d28-d30 (the fourth name produced by map() is unused)
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero=&Dlo($Z);
+my $temp=&Dlo($Temp);
+
+# Core registers: r0-r3 carry the first four arguments as received,
+# r4/r5 are loaded from the caller's stack (pointer to n0, and num),
+# r6-r9 are the temporary-vector read/write pointers and loop counters.
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+
+# bn_mul8x_mont_neon, assembled only for __ARM_MAX_ARCH__>=7.  It is
+# reached by a branch from bn_mul_mont, which takes that branch only
+# when num is a multiple of 8.
+#
+# Outline of the generated code:
+#  * save r4-r11 and d8-d15, keeping the entry sp in ip; the epilogue
+#    recovers the save area as ip-96 (8*4 + 8*8 bytes);
+#  * reserve 16*num+16 bytes below sp, rounded down to a 64-byte
+#    boundary, for the temporary vector;
+#  * each 32-bit multiplier word ($Bi, $Ni) is spread by vzip.16 against
+#    a freshly zeroed register before being fed to vmull/vmlal.u32;
+#  * num==8 is handled entirely in registers (.LNEON_outer8); larger
+#    sizes go through .LNEON_1st, then .LNEON_outer/.LNEON_inner;
+#  * .LNEON_tail/.LNEON_tail2 propagate carries between lanes and store
+#    32-bit result words;
+#  * .LNEON_sub subtracts np into rp, and .LNEON_copy_n_zap conditionally
+#    (movcc) replaces rp with the unsubtracted value while storing zeros
+#    over the temporary area.
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.type	bn_mul8x_mont_neon,%function
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4-r11}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	ldmia	ip,{r4-r5}		@ load rest of parameter block
+
+	sub		$toutptr,sp,#16
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	sub		$toutptr,$toutptr,$num,lsl#4
+	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
+	and		$toutptr,$toutptr,#-64
+	vld1.32		{${M0}[0]}, [$n0,:32]
+	mov		sp,$toutptr			@ alloca
+	veor		$zero,$zero,$zero
+	subs		$inner,$num,#8
+	vzip.16		$Bi,$zero
+
+	vmull.u32	$A0xB,$Bi,${A0}[0]
+	vmull.u32	$A1xB,$Bi,${A0}[1]
+	vmull.u32	$A2xB,$Bi,${A1}[0]
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	vmull.u32	$A3xB,$Bi,${A1}[1]
+
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	veor		$zero,$zero,$zero
+	vmul.u32	$Ni,$temp,$M0
+
+	vmull.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+	vmull.u32	$A5xB,$Bi,${A2}[1]
+	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmull.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_1st
+
+	@ special case for num=8, everything is in register bank...
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	sub		$outer,$num,#1
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vmov		$Temp,$A0xB
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vmov		$A0xB,$A1xB
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmov		$A1xB,$A2xB
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vmov		$A2xB,$A3xB
+	vmov		$A3xB,$A4xB
+	vshr.u64	$temp,$temp,#16
+	vmov		$A4xB,$A5xB
+	vmov		$A5xB,$A6xB
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vmov		$A6xB,$A7xB
+	veor		$A7xB,$A7xB
+	vshr.u64	$temp,$temp,#16
+
+	b	.LNEON_outer8
+
+.align	4
+.LNEON_outer8:
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	veor		$zero,$zero,$zero
+	vzip.16		$Bi,$zero
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	veor		$zero,$zero,$zero
+	subs		$outer,$outer,#1
+	vmul.u32	$Ni,$temp,$M0
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vmov		$Temp,$A0xB
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vmov		$A0xB,$A1xB
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmov		$A1xB,$A2xB
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vmov		$A2xB,$A3xB
+	vmov		$A3xB,$A4xB
+	vshr.u64	$temp,$temp,#16
+	vmov		$A4xB,$A5xB
+	vmov		$A5xB,$A6xB
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vmov		$A6xB,$A7xB
+	veor		$A7xB,$A7xB
+	vshr.u64	$temp,$temp,#16
+
+	bne	.LNEON_outer8
+
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	mov		$toutptr,sp
+	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	mov		$inner,$num
+	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+	add		$tinptr,sp,#16
+	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
+	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+	b	.LNEON_tail2
+
+.align	4
+.LNEON_1st:
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	 vld1.32	{$A0-$A3}, [$aptr]!
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	subs		$inner,$inner,#8
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	 vld1.32	{$N0-$N1}, [$nptr]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
+
+	vmull.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.32	{$N2-$N3}, [$nptr]!
+	vmull.u32	$A1xB,$Bi,${A0}[1]
+	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
+	vmull.u32	$A2xB,$Bi,${A1}[0]
+	vmull.u32	$A3xB,$Bi,${A1}[1]
+	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
+
+	vmull.u32	$A4xB,$Bi,${A2}[0]
+	vmull.u32	$A5xB,$Bi,${A2}[1]
+	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vmull.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_1st
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	add		$tinptr,sp,#16
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	 vld1.64	{$Temp}, [sp,:128]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	sub		$outer,$num,#1
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vshr.u64	$temp,$temp,#16
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	veor		$Z,$Z,$Z
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vst1.64		{$Z},          [$toutptr,:128]
+	vshr.u64	$temp,$temp,#16
+
+	b		.LNEON_outer
+
+.align	4
+.LNEON_outer:
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
+	vld1.32		{$A0-$A3},  [$aptr]!
+	veor		$zero,$zero,$zero
+	mov		$toutptr,sp
+	vzip.16		$Bi,$zero
+	sub		$inner,$num,#8
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	veor		$zero,$zero,$zero
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	 vld1.64	{$A7xB},[$tinptr,:128]!
+	vmul.u32	$Ni,$temp,$M0
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+.LNEON_inner:
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	 vld1.32	{$A0-$A3}, [$aptr]!
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	 subs		$inner,$inner,#8
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.64	{$A7xB},       [$tinptr, :128]!
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_inner
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	add		$tinptr,sp,#16
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	 vld1.64	{$Temp}, [sp,:128]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	subs		$outer,$outer,#1
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vshr.u64	$temp,$temp,#16
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	vshr.u64	$temp,$temp,#16
+
+	bne	.LNEON_outer
+
+	mov		$toutptr,sp
+	mov		$inner,$num
+
+.LNEON_tail:
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
+	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
+	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
+	vld1.64		{$A7xB},       [$tinptr, :128]!
+	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+.LNEON_tail2:
+	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
+	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
+	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
+	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
+
+	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
+	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
+	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
+	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
+
+	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
+	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
+	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
+	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
+
+	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
+	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
+	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
+	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
+
+	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
+	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
+	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
+	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
+
+	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
+	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
+	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
+	vld1.64		{$A0xB}, [$tinptr, :128]!
+	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
+	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
+
+	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
+	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
+	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
+	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
+	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
+	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+	subs		$inner,$inner,#8
+	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+
+	bne	.LNEON_tail
+
+	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
+	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
+	subs	$aptr,sp,#0				@ clear carry flag
+	add	$bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+	ldmia	$aptr!, {r4-r7}
+	ldmia	$nptr!, {r8-r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	$aptr,$bptr				@ preserves carry
+	stmia	$rptr!, {r8-r11}
+	bne	.LNEON_sub
+
+	ldr	r10, [$aptr]				@ load top-most bit
+	veor	q0,q0,q0
+	sub	r11,$bptr,sp				@ this is num*4
+	veor	q1,q1,q1
+	mov	$aptr,sp
+	sub	$rptr,$rptr,r11				@ rewind $rptr
+	mov	$nptr,$bptr				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+.LNEON_copy_n_zap:
+	ldmia	$aptr!, {r4-r7}
+	ldmia	$rptr,  {r8-r11}
+	movcc	r8, r4
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r11,r7
+	ldmia	$aptr, {r4-r7}
+	stmia	$rptr!, {r8-r11}
+	sub	$aptr,$aptr,#16
+	ldmia	$rptr, {r8-r11}
+	movcc	r8, r4
+	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r11,r7
+	teq	$aptr,$bptr				@ preserves carry
+	stmia	$rptr!, {r8-r11}
+	bne	.LNEON_copy_n_zap
+
+	sub	sp,ip,#96
+        vldmia  sp!,{d8-d15}
+        ldmia   sp!,{r4-r11}
+	ret						@ bx lr
+.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+# Trailer of the generated file: an identification string, alignment
+# padding, and -- for __ARM_MAX_ARCH__>=7 only -- a common-symbol
+# definition of OPENSSL_armcap_P (4 bytes, 4-byte aligned), the word the
+# entry code of bn_mul_mont tests to decide whether NEON may be used.
+$code.=<<___;
+.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+# Post-process the whole text and write it out.  The order of the three
+# substitutions matters:
+#  1. evaluate every `...` expression (the &Dlo()/&Dhi() calls) and
+#     splice in its value;
+#  2. turn each literal "bx lr" into a raw opcode word;
+#  3. only then expand "ret" into "bx lr", so that these stay real
+#     instructions and are not caught by step 2.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx	lr/gm;
+print $code;
+# close() flushes the buffered output; check its result so that a write
+# error (e.g. a full disk) aborts the build instead of silently leaving
+# a truncated assembly file behind.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/bn-586.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/bn-586.pl
@@ -0,0 +1,774 @@
+#!/usr/local/bin/perl
+
+# Make x86asm.pl loadable from this script's directory or ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+# SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 is passed.
+$sse2=(grep(/-DOPENSSL_IA32_SSE2/,@ARGV))?1:0;
+
+if ($sse2) { &external_label("OPENSSL_ia32cap_P"); }
+
+# Emit every routine; each generator sub is named after the symbol it emits.
+foreach $routine ("bn_mul_add_words","bn_mul_words","bn_sqr_words",
+		  "bn_div_words","bn_add_words","bn_sub_words",
+		  "bn_sub_part_words")
+	{
+	&$routine($routine);
+	}
+
+&asm_finish();
+
+sub bn_mul_add_words	# emits NAME(r,a,num,w): r[i]+=a[i]*w for num words, final carry word in eax
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";		# word count on the SSE2 path
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");	# eax -> OPENSSL_ia32cap_P (helper defined outside this file)
+		&bt(&DWP(0,"eax"),26);			# test bit 26 of its first dword
+		&jnc(&label("maw_non_sse2"));		# bit clear: use the integer code below
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry_in
+		&jmp(&label("maw_sse2_entry"));
+		# 8 words per pass; reached only through the jnz at maw_sse2_entry
+	&set_label("maw_sse2_unrolled",16);
+		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
+		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
+		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
+		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
+		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
+		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
+		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
+		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
+		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
+		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
+		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
+		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
+		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
+		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
+		&movd(&DWP(0,$r,"",0),"mm1");
+		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
+		&psrlq("mm1",32);		# mm1 = carry0
+		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
+		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
+		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
+		&movd(&DWP(4,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry1
+		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
+		&add($a,32);
+		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
+		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
+		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
+		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
+		&movd(&DWP(8,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry2
+		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
+		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
+		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
+		&movd(&DWP(12,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry3
+		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
+		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
+		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
+		&movd(&DWP(16,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry4
+		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
+		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
+		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
+		&movd(&DWP(20,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry5
+		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
+		&movd(&DWP(24,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry6
+		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
+		&movd(&DWP(28,$r,"",0),"mm1");
+		&lea($r,&DWP(32,$r));
+		&psrlq("mm1",32);		# mm1 = carry_out
+
+		&sub($c,8);
+		&jz(&label("maw_sse2_exit"));
+	&set_label("maw_sse2_entry");
+		&test($c,0xfffffff8);		# at least 8 words left?
+		&jnz(&label("maw_sse2_unrolled"));
+		# NOTE(review): the loop below runs at least once, so num==0 is not handled on this path - confirm callers never pass 0
+	&set_label("maw_sse2_loop",4);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm3");		# carry += r[i]
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("maw_sse2_loop"));
+	&set_label("maw_sse2_exit");
+		&movd("eax","mm1");		# c = carry_out
+		&emms();
+		&ret();
+
+	&set_label("maw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	# r
+
+	&mov("ecx",&wparam(2));	# num
+	&mov($a,&wparam(1));	# a
+
+	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
+	&mov($w,&wparam(3));	# w
+
+	&push("ecx");		# Up the stack for a tmp variable
+
+	&jz(&label("maw_finish"));	# flags are still those of the and above
+
+	&set_label("maw_loop",16);
+
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+= c
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",&DWP($i,$r));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&sub("ecx",8);
+	&lea($a,&DWP(32,$a));	# lea leaves the flags of the sub intact
+	&lea($r,&DWP(32,$r));
+	&jnz(&label("maw_loop"));
+
+	&set_label("maw_finish",0);
+	&mov("ecx",&wparam(2));	# get num
+	&and("ecx",7);		# words left over after the 8-word passes
+	&jnz(&label("maw_finish2"));	# helps branch prediction
+	&jmp(&label("maw_end"));
+
+	&set_label("maw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a));	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &dec("ecx") if ($i != 7-1);
+		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
+		 &mov($c,"edx");		# c=  H(t);
+		&jz(&label("maw_end")) if ($i != 7-1);
+		}
+	&set_label("maw_end",0);
+	&mov("eax",$c);		# return the carry word
+
+	&pop("ecx");	# discard the tmp variable pushed above
+
+	&function_end($name);
+	}
+
+sub bn_mul_words	# emits NAME(r,a,num,w): r[i]=low word of a[i]*w+carry for num words, final carry word in eax
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";		# word count on the SSE2 path
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");	# eax -> OPENSSL_ia32cap_P (helper defined outside this file)
+		&bt(&DWP(0,"eax"),26);			# test bit 26 of its first dword
+		&jnc(&label("mw_non_sse2"));		# bit clear: use the integer code below
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry = 0
+		# NOTE(review): the loop below runs at least once, so num==0 is not handled on this path - confirm callers never pass 0
+	&set_label("mw_sse2_loop",16);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("mw_sse2_loop"));
+
+		&movd("eax","mm1");		# return carry
+		&emms();
+		&ret();
+	&set_label("mw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ecx";
+	$r="edi";
+	$c="esi";
+	$num="ebp";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	# r
+	&mov($a,&wparam(1));	# a
+	&mov($num,&wparam(2));	# num
+	&mov($w,&wparam(3));	# w
+
+	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
+	&jz(&label("mw_finish"));
+
+	&set_label("mw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jz(&label("mw_finish"));
+	&jmp(&label("mw_loop"));
+
+	&set_label("mw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);		# words left over after the 8-word passes
+	&jnz(&label("mw_finish2"));
+	&jmp(&label("mw_end"));
+
+	&set_label("mw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		 &dec($num) if ($i != 7-1);
+		&jz(&label("mw_end")) if ($i != 7-1);
+		}
+	&set_label("mw_end",0);
+	&mov("eax",$c);		# return the carry word
+
+	&function_end($name);
+	}
+
+sub bn_sqr_words	# emits NAME(r,a,num): r[2i],r[2i+1] = low,high word of a[i]*a[i] for num words
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";		# word count on the SSE2 path
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");	# eax -> OPENSSL_ia32cap_P (helper defined outside this file)
+		&bt(&DWP(0,"eax"),26);			# test bit 26 of its first dword
+		&jnc(&label("sqr_non_sse2"));		# bit clear: use the integer code below
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		# NOTE(review): the loop below runs at least once, so num==0 is not handled on this path - confirm callers never pass 0
+	&set_label("sqr_sse2_loop",16);
+		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
+		&pmuludq("mm0","mm0");		# a[i] *= a[i]
+		&lea($a,&DWP(4,$a));		# a++
+		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
+		&sub($c,1);
+		&lea($r,&DWP(8,$r));		# r += 2
+		&jnz(&label("sqr_sse2_loop"));
+
+		&emms();
+		&ret();
+	&set_label("sqr_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$r="esi";
+	$a="edi";
+	$num="ebx";
+
+	&mov($r,&wparam(0));	# r
+	&mov($a,&wparam(1));	# a
+	&mov($num,&wparam(2));	# num
+
+	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
+	&jz(&label("sw_finish"));
+
+	&set_label("sw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+		&mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*2,$r,"",0),"eax");	# low word of the square
+		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,64);
+	&sub($num,8);
+	&jnz(&label("sw_loop"));
+
+	&set_label("sw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);		# words left over after the 8-word passes
+	&jz(&label("sw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov("eax",&DWP($i*4,$a,"",0));	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*8,$r,"",0),"eax");	# low word of the square
+		 &dec($num) if ($i != 7-1);
+		&mov(&DWP($i*8+4,$r,"",0),"edx");
+		 &jz(&label("sw_end")) if ($i != 7-1);
+		}
+	&set_label("sw_end",0);
+
+	&function_end($name);
+	}
+
+sub bn_div_words	# emits NAME(h,l,d): eax = quotient of the two-word value h:l divided by d
+	{
+	local($name)=@_;
+	local($arg)=0;
+
+	&function_begin_B($name,"");
+	# arguments in stack order: h -> edx, l -> eax, d -> ecx
+	foreach $reg ("edx","eax","ecx") { &mov($reg,&wparam($arg++)); }
+	&div("ecx");		# divides edx:eax by ecx
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_add_words	# emits NAME(r,a,b,num): r[i]=a[i]+b[i] with carry propagation, final carry (0/1) in eax
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&add($tmp1,$c);				# *a + carry in
+		 &mov($c,0);				# mov leaves CF untouched
+		&adc($c,$c);				# c = carry out of the add above
+		 &add($tmp1,$tmp2);			# + *b
+		&adc($c,0);				# c += carry out of the second add
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);		# words left over after the 8-word passes
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_words	# emits NAME(r,a,b,num): r[i]=a[i]-b[i] with borrow propagation, final borrow (0/1) in eax
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);				# *a - borrow in
+		 &mov($c,0);				# mov leaves CF untouched
+		&adc($c,$c);				# c = borrow out of the sub above
+		 &sub($tmp1,$tmp2);			# - *b
+		&adc($c,0);				# c += borrow out of the second sub
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);		# words left over after the 8-word passes
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_part_words	# emits NAME(r,a,b,num,dl): r=a-b over num words, then |dl| more words from b (dl<0) or a (dl>0); borrow in eax
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);				# *a - borrow in
+		 &mov($c,0);				# mov leaves CF untouched
+		&adc($c,$c);				# c = borrow out of the sub above
+		 &sub($tmp1,$tmp2);			# - *b
+		&adc($c,0);				# c += borrow out of the second sub
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);		# words left over after the 8-word passes
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP(0,$a,"",0));	# *a
+		 &mov($tmp2,&DWP(0,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
+		&add($a, 4);			# pointers are stepped per word here so that a/b/r
+		&add($b, 4);			# end up just past the common part for the dl code
+		&add($r, 4);
+		 &dec($num) if ($i != 6);
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+	&cmp(&wparam(4),0);		# dl == 0: nothing beyond the common part
+	&je(&label("pw_end"));
+
+	&mov($num,&wparam(4));	# get dl
+	&cmp($num,0);			# repeats the test above on the loaded value
+	&je(&label("pw_end"));
+	&jge(&label("pw_pos"));		# dl > 0
+
+	&comment("pw_neg");
+	&mov($tmp2,0);
+	&sub($tmp2,$num);
+	&mov($num,$tmp2);		# num = -dl
+	&and($num,0xfffffff8);	# rounded down to a multiple of 8
+	&jz(&label("pw_neg_finish"));
+
+	&set_label("pw_neg_loop",0);
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl<0 Round $i");
+
+	    &mov($tmp1,0);			# the a word is taken as 0 here
+	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+	}
+	    
+	&comment("");
+	&add($b,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_neg_loop"));
+	    
+	&set_label("pw_neg_finish",0);
+	&mov($tmp2,&wparam(4));	# get dl
+	&mov($num,0);
+	&sub($num,$tmp2);
+	&and($num,7);			# (-dl) & 7 words left over
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl<0 Tail Round $i");
+	    &mov($tmp1,0);
+	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &dec($num) if ($i != 6);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+
+	&jmp(&label("pw_end"));
+	# dl > 0: r[i] = a[i] - borrow for the extra words
+	&set_label("pw_pos",0);
+	
+	&and($num,0xfffffff8);	# dl rounded down to a multiple of 8
+	&jz(&label("pw_pos_finish"));
+
+	&set_label("pw_pos_loop",0);
+
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl>0 Round $i");
+
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_nc".$i));		# borrow gone: the rest is a plain copy
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_pos_loop"));
+	    
+	&set_label("pw_pos_finish",0);
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl>0 Tail Round $i");
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_tail_nc".$i));	# borrow gone: finish in the copy tail
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+	&mov($c,1);			# reached only when the last tail word still borrowed
+	&jmp(&label("pw_end"));
+
+	&set_label("pw_nc_loop",0);	# copy-only variant of pw_pos_loop
+	for ($i=0; $i<8; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_nc".$i,0);		# jnc target: word $i is already stored
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_nc_loop"));
+	    
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_nc_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_tail_nc".$i,0);	# jnc target: word $i is already stored
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_nc_end")) if ($i != 6);
+	}
+
+	&set_label("pw_nc_end",0);
+	&mov($c,0);			# the copy path ends with no borrow
+
+	&set_label("pw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
--- a/openssl-1.0.2f/crypto/bn/asm/co-586.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/co-586.pl
@@ -0,0 +1,287 @@
+#!/usr/local/bin/perl
+
+# Make x86asm.pl loadable from this script's directory or ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+# Emit the multiplication routines first, then the squaring ones;
+# within each kind the 8-word variant comes before the 4-word one.
+foreach $words (8,4) { &bn_mul_comba("bn_mul_comba".$words,$words); }
+foreach $words (8,4) { &bn_sqr_comba("bn_sqr_comba".$words,$words); }
+
+&asm_finish();
+
+sub mul_add_c
+	{
+	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# Adds the product of eax and edx (expected to hold a[$ai] and b[$bi]
+	# on entry) into the accumulator $c2:$c1:$c0.  $pos picks the extra
+	# work interleaved with the carry chain:
+	#   0  - load the next pair, a[$na] into eax and b[$nb] into edx;
+	#   1  - store $c0 to r[$i], then load the next pair as above;
+	#   >1 - store $c0 to r[$i] only.
+
+	&comment("mul a[$ai]*b[$bi]");
+
+	&mul("edx");
+	&add($c0,"eax");
+	if ($pos == 0)	{ &mov("eax",&DWP(($na)*4,$a,"",0)); }	# next a
+	if ($pos > 0)	{ &mov("eax",&wparam(0)); }		# eax = r
+	&adc($c1,"edx");
+	if ($pos == 0 || $pos == 1)
+		{ &mov("edx",&DWP(($nb)*4,$b,"",0)); }		# next b
+	&adc($c2,0);
+	if ($pos > 0)
+		{
+		&mov(&DWP($i*4,"eax","",0),$c0);		# r[$i] = $c0
+		&mov("eax",&DWP(($na)*4,$a,"",0)) if ($pos == 1); # next a
+		}
+	}
+
+sub sqr_add_c
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# Adds the product of eax with itself (when $ai == $bi) or with edx
+	# into the accumulator $c2:$c1:$c0; the operands are expected to be
+	# pre-loaded.  $pos picks the extra work interleaved with the adds:
+	#   0  - load a[$na] into eax for the next product;
+	#   1  - store $c0 to r[$i] and load a[$na] into eax, plus a[$nb]
+	#        into edx when the two indices differ;
+	#   >1 - store $c0 to r[$i] only.
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	&mul(($ai == $bi) ? "eax" : "edx");
+	&add($c0,"eax");
+	if ($pos == 0)
+		{ &mov("eax",&DWP(($na)*4,$a,"",0)); }		# next a
+	&adc($c1,"edx");
+	if (($pos == 1) && ($na != $nb))
+		{ &mov("edx",&DWP(($nb)*4,$a,"",0)); }		# second operand
+	&adc($c2,0);
+	if ($pos > 0)
+		{
+		&mov(&DWP($i*4,$r,"",0),$c0);			# r[$i] = $c0
+		&mov("eax",&DWP(($na)*4,$a,"",0)) if ($pos == 1); # next a
+		}
+	}
+
+sub sqr_add_c2
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# Adds twice the product of the pre-loaded operands (eax times edx,
+	# or eax squared when $ai == $bi) into the accumulator $c2:$c1:$c0.
+	# The two-word product is doubled in place first, and the bit
+	# shifted out of edx goes straight into $c2.  $pos picks the extra
+	# work interleaved with the adds:
+	#   0 or 1 - load a[$na] into eax;
+	#   >0     - store $c0 to r[$i];
+	#   <=1    - load a[$nb] into edx when the two indices differ.
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	&mul(($ai == $bi) ? "eax" : "edx");
+	&add("eax","eax");				# low word * 2
+	&adc("edx","edx");				# high word * 2 + carry
+	&adc($c2,0);					# bit doubled out of edx
+
+	&add($c0,"eax");
+	&adc($c1,"edx");
+	if ($pos == 0 || $pos == 1)
+		{ &mov("eax",&DWP(($na)*4,$a,"",0)); }	# next first operand
+	&adc($c2,0);
+
+	if ($pos > 0)
+		{ &mov(&DWP($i*4,$r,"",0),$c0); }	# r[$i] = $c0
+	if (($pos <= 1) && ($na != $nb))
+		{ &mov("edx",&DWP(($nb)*4,$a,"",0)); }	# next second operand
+	}
+
+sub bn_mul_comba	# emits NAME(r,a,b): fully unrolled column-by-column product of two $num-word numbers
+	{
+	local($name,$num)=@_;
+	local($a,$b,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($tot,$end);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";		# three-word column accumulator; the roles
+	$c1="ecx";		# rotate after every stored result word
+	$c2="ebp";
+	$a="esi";
+	$b="edi";
+	
+	$as=0;			# a index of the first product in the column
+	$ae=0;
+	$bs=0;			# b index of the first product in the column
+	$be=0;			# b index of the last product in the column
+	$tot=$num+$num-1;	# number of columns
+
+	&push("esi");
+	 &mov($a,&wparam(1));
+	&push("edi");
+	 &mov($b,&wparam(2));
+	&push("ebp");
+	 &push("ebx");
+
+	&xor($c0,$c0);
+	 &mov("eax",&DWP(0,$a,"",0));	# load the first word of a
+	&xor($c1,$c1);
+	 &mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("################## Calculate word $i"); 
+
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($j+1) == $end)		# last product of this column
+				{
+				$v=1;
+				$v=2 if (($i+1) == $tot);	# and of the whole routine
+				}
+			else
+				{ $v=0; }
+			if (($j+1) != $end)		# next product is in the same column
+				{
+				$na=($ai-1);
+				$nb=($bi+1);
+				}
+			else				# next product starts the next column
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				# &mov("eax",&wparam(0));
+				# &mov(&DWP($i*4,"eax","",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&comment("save r[$i]");
+	# &mov("eax",&wparam(0));
+	&mov(&DWP($i*4,"eax","",0),$c0);	# top word; eax still holds r from the last mul_add_c
+
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_sqr_comba	# emits NAME(r,a): fully unrolled column-by-column square of a $num-word number
+	{
+	local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2);	# register names, all assigned below
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($b,$tot,$end,$half);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";		# three-word column accumulator; the roles
+	$c1="ecx";		# rotate after every stored result word
+	$c2="ebp";
+	$a="esi";
+	$r="edi";
+
+	&push("esi");
+	 &push("edi");
+	&push("ebp");
+	 &push("ebx");
+	&mov($r,&wparam(0));
+	 &mov($a,&wparam(1));
+	&xor($c0,$c0);
+	 &xor($c1,$c1);
+	&mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+	$as=0;			# first-operand index of the first product in the column
+	$ae=0;
+	$bs=0;			# second-operand index of the first product in the column
+	$be=0;
+	$tot=$num+$num-1;	# number of columns
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("############### Calculate word $i");
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($ai-1) < ($bi+1))		# indices would cross: last product of this column
+				{
+				$v=1;
+				$v=2 if ($i+1) == $tot;	# and of the whole routine
+				}
+			else
+				{ $v=0; }
+			if (!$v)			# next product is in the same column
+				{
+				$na=$ai-1;
+				$nb=$bi+1;
+				}
+			else				# next product starts the next column
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+			if ($ai == $bi)			# diagonal term, counted once
+				{
+				&sqr_add_c($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			else				# off-diagonal term, counted twice
+				{
+				&sqr_add_c2($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				#&mov(&DWP($i*4,$r,"",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				last;
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&mov(&DWP($i*4,$r,"",0),$c0);	# top word of the result
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
--- a/openssl-1.0.2f/crypto/bn/asm/ia64-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+#   stalls after ldf8, xma and getf.sig outside inner loop and
+#   improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+#   once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+#   acute interest, because upcoming Tukwila's individual cores are
+#   reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
+# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
+# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
+# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
+# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
+# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
+# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
+# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
+# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
+# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
+# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
+# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
+# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
+#
+# As it can be seen, RSA sign performance improves by 130-30%,
+# hereafter less for longer keys, while verify - by 74-13%.
+# DSA performance improves by 115-30%.
+
+if ($^O eq "hpux") {		# HP-UX: pointer-forming adds default to addp4
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }	# plain add once a +DD64 or -mlp64 flag is passed
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+//		    const BN_ULONG *bp,const BN_ULONG *np,
+//		    const BN_ULONG *n0p,int num);			
+.align	64
+.global	bn_mul_mont#
+.proc	bn_mul_mont#
+bn_mul_mont:
+	.prologue
+	.body
+{ .mmi;	cmp4.le		p6,p7=2,r37;;
+(p6)	cmp4.lt.unc	p8,p9=8,r37
+	mov		ret0=r0		};;
+{ .bbb;
+(p9)	br.cond.dptk.many	bn_mul_mont_8
+(p8)	br.cond.dpnt.many	bn_mul_mont_general
+(p7)	br.ret.spnt.many	b0	};;
+.endp	bn_mul_mont#
+
+prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
+
+rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
+tptr=r16;	// &tp[0]
+tp_1=r17;	// &tp[-1]
+num=r18;	len=r19;	lc=r20;
+topbit=r21;	// carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align	64
+.local	bn_mul_mont_general#
+.proc	bn_mul_mont_general#
+bn_mul_mont_general:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,6,2,0,8
+	$ADDP	aptr=0,in1
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc		}
+{ .mmi;	.vframe	prevsp
+	mov	prevsp=sp
+	$ADDP	bptr=0,in2
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
+	.rotr		a[3],n[3],t[2]
+
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		alo[4]=[aptr],16	// ap[0]
+	$ADDP		r30=8,in1	};;
+{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
+	ldf8		alo[2]=[aptr],16	// ap[2]
+	$ADDP		in4=0,in4	};;
+{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
+	ldf8		n0=[in4]		// n0
+	$ADDP		rptr=0,in0		}
+{ .mmi;	$ADDP		nptr=0,in3
+	mov		r31=16
+	zxt4		num=in5		};;
+{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
+	shladd		len=num,3,r0
+	shladd		r31=num,3,r31	};;
+{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
+	add		lc=-5,num
+	sub		r31=sp,r31	};;
+{ .mfb;	and		sp=-16,r31		// alloca
+	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
+	nop.b		0		}
+{ .mfb;	nop.m		0
+	xmpy.lu		alo[4]=alo[4],bi
+	brp.loop.imp	.L1st_ctop,.L1st_cend-16
+					};;
+{ .mfi;	nop.m		0
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
+	add		tp_1=8,sp	}
+{ .mfi;	nop.m		0
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20001f<<16
+			// ------^----- (p40) at first (p23)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	nop.m		0
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;	nop.m		0
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+.align	32
+.L1st_ctop:
+.pred.rel	"mutex",p40,p42
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
+{ .mfi;	(p23)	st8		[tp_1]=n[2],8
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	br.ctop.sptk	.L1st_ctop			};;
+.L1st_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	add		num=-1,num	};;	// num--
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+	sub		nptr=nptr,len	};;
+{ .mmi;	.pred.rel	"mutex",p39,p41
+(p39)	add		topbit=r0,r0
+(p41)	add		topbit=r0,r0,1
+	nop.i		0		}	
+{ .mmi;	st8		[tp_1]=n[0]
+	add		tptr=16,sp
+	add		tp_1=8,sp	};;
+
+.Louter:
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		ahi[3]=[tptr]		// tp[0]
+	add		r30=8,aptr	};;
+{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
+	ldf8		alo[3]=[r30],16		// ap[1]
+	add		r31=8,nptr	};;
+{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
+	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
+	brp.loop.imp	.Linner_ctop,.Linner_cend-16
+					}
+{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
+	xma.lu		alo[4]=alo[4],bi,ahi[3]
+	clrrrb.pr			};;
+{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
+	nop.i		0		}
+{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20101f<<16
+			// ------^----- (p40) at first (p23)
+			// --------^--- (p30) at first (p22)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align	32
+.Linner_ctop:
+.pred.rel	"mutex",p40,p42
+.pred.rel	"mutex",p30,p32
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
+{ .mfi;	(p16)	nop.m		0
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p16)	nop.f		0
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mfi;	(p21)	ld8		t[0]=[tptr],8
+	(p16)	nop.f		0
+	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
+{ .mfi;	(p16)	nop.m		0
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
+{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
+{ .mmb;	(p23)	st8		[tp_1]=n[2],8
+	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
+	br.ctop.sptk	.Linner_ctop			};;
+.Linner_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	nop.i		0		};;
+
+{ .mmi;	.pred.rel	"mutex",p31,p33
+(p31)	add		a[0]=a[0],topbit
+(p33)	add		a[0]=a[0],topbit,1
+	mov		topbit=r0	};;
+{ .mfi; .pred.rel	"mutex",p31,p33
+(p31)	cmp.ltu		p32,p30=a[0],topbit
+(p33)	cmp.leu		p32,p30=a[0],topbit
+					}
+{ .mfi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+					};;
+{ .mmi;	.pred.rel	"mutex",p44,p46
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+(p32)	add		topbit=r0,r0,1	}
+
+{ .mmi;	st8		[tp_1]=n[0],8
+	cmp4.ne		p6,p0=1,num
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	sub		nptr=nptr,len
+(p41)	add		topbit=r0,r0,1
+	add		tptr=16,sp	}
+{ .mmb;	add		tp_1=8,sp
+	add		num=-1,num		// num--
+(p6)	br.cond.sptk.many	.Louter	};;
+
+{ .mbb;	add		lc=4,lc
+	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
+	clrrrb.pr			};;
+{ .mii;	nop.m		0
+	mov		pr.rot=0x10001<<16
+			// ------^---- (p33) at first (p17)
+	mov		ar.lc=lc	}
+{ .mii;	nop.m		0
+	mov		ar.ec=3
+	nop.i		0		};;
+
+.Lsub_ctop:
+.pred.rel	"mutex",p33,p35
+{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
+	(p16)	nop.f		0
+	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
+{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
+	(p16)	nop.f		0
+	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
+{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
+	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
+	(p18)	nop.b		0			}
+{ .mib;	(p18)	nop.m		0
+	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
+	br.ctop.sptk	.Lsub_ctop			};;
+.Lsub_cend:
+
+{ .mmb;	.pred.rel	"mutex",p34,p36
+(p34)	sub	topbit=topbit,r0	// (p19)
+(p36)	sub	topbit=topbit,r0,1
+	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
+					}
+{ .mmb;	sub	rptr=rptr,len		// rewind
+	sub	tptr=tptr,len
+	clrrrb.pr			};;
+{ .mmi;	and	aptr=tptr,topbit
+	andcm	bptr=rptr,topbit
+	mov	pr.rot=1<<16		};;
+{ .mii;	or	nptr=aptr,bptr
+	mov	ar.lc=lc
+	mov	ar.ec=3			};;
+
+.Lcopy_ctop:
+{ .mmb;	(p16)	ld8	n[0]=[nptr],8
+	(p18)	st8	[tptr]=r0,8
+	(p16)	nop.b	0		}
+{ .mmb;	(p16)	nop.m	0
+	(p18)	st8	[rptr]=n[2],8
+	br.ctop.sptk	.Lcopy_ctop	};;
+.Lcopy_cend:
+
+{ .mmi;	mov		ret0=1			// signal "handled"
+	rum		1<<5			// clear um.mfh
+	mov		ar.lc=prevlc	}
+{ .mib;	.restore	sp
+	mov		sp=prevsp
+	mov		pr=prevpr,0x1ffff
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_general#
+
+// Static register aliases used by bn_mul_mont_8 below.
+// a1-a8 and n1-n8 are general registers that receive words moved out of
+// the floating-point bank with getf.sig; t0 is the general register that
+// carries the least significant word of the running sum.
+a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
+n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
+t0=r15;
+
+// ai0-ai7 hold the zero-padded words of ap[] and ni0-ni7 those of np[]
+// in floating-point registers. ni0-ni7 are f16-f23, the registers that
+// the prologue of bn_mul_mont_8 spills and its epilogue refills.
+ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+// bn_mul_mont_8: Montgomery multiplication with every operand padded to
+// 8 words and kept in registers.
+//
+// Inputs, as consumed by the code of this procedure:
+//	in0 - result pointer (copied to rptr)
+//	in1 - ap, in2 - bp, in3 - np
+//	in4 - pointer to n0 (fetched with ldf8 n0=[in4])
+//	in5 - num; words 2..7 of each vector are loaded only when num is
+//	      large enough, otherwise the register is zeroed
+// Returns 1 in ret0 ("handled").
+//
+// Frame: sp ends up 8*16 bytes below prevsp (7*16 from the add plus 16
+// from the post-decrementing first spill). f16..f22 are spilled at
+// prevsp-7*16 .. prevsp-1*16 and f23 at prevsp+0; the epilogue refills
+// them from the same slots.
+.align	64
+.skip	48		// aligns loop body
+.local	bn_mul_mont_8#
+.proc	bn_mul_mont_8#
+bn_mul_mont_8:
+	.prologue
+{ .mmi;	.save		ar.pfs,prevfs
+	alloc		prevfs=ar.pfs,6,2,0,8	// 6 in, 2 local, 0 out, 8 rotating
+	.vframe		prevsp
+	mov		prevsp=sp
+	.save		ar.lc,prevlc
+	mov		prevlc=ar.lc	}
+{ .mmi;	add		r17=-6*16,sp		// r17 walks the f17,f19,f21,f23 slots
+	add		sp=-7*16,sp
+	.save		pr,prevpr
+	mov		prevpr=pr	};;
+
+// spill f16-f23; r16 and r17 advance by 32 so they alternate over slots
+{ .mmi;	.save.gf	0,0x10
+	stf.spill	[sp]=f16,-16
+	.save.gf	0,0x20
+	stf.spill	[r17]=f17,32
+	add		r16=-5*16,prevsp};;
+{ .mmi;	.save.gf	0,0x40
+	stf.spill	[r16]=f18,32
+	.save.gf	0,0x80
+	stf.spill	[r17]=f19,32
+	$ADDP		aptr=0,in1	};;
+{ .mmi;	.save.gf	0,0x100
+	stf.spill	[r16]=f20,32
+	.save.gf	0,0x200
+	stf.spill	[r17]=f21,32
+	$ADDP		r29=8,in1	};;
+{ .mmi;	.save.gf	0,0x400
+	stf.spill	[r16]=f22
+	.save.gf	0,0x800
+	stf.spill	[r17]=f23
+	$ADDP		rptr=0,in0	};;
+
+	.body
+// rotating register names used by the software-pipelined loop below
+	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+	.rotr		t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
+	ldf8		ai1=[r29],16		// ap[1]
+	$ADDP		bptr=0,in2	}
+{ .mmi;	$ADDP		r30=8,in2
+	$ADDP		nptr=0,in3
+	$ADDP		r31=8,in3	};;
+{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
+	ldf8		bj[6]=[r30],16		// bp[1]
+	cmp4.le		p4,p5=3,in5	}
+{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
+	ldf8		ni1=[r31],16		// np[1]
+	cmp4.le		p6,p7=4,in5	};;
+
+{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
+	(p5)fcvt.fxu	ai2=f0
+	cmp4.le		p8,p9=5,in5	}
+{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
+	(p7)fcvt.fxu	ai3=f0
+	cmp4.le		p10,p11=6,in5	}
+{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
+	(p5)fcvt.fxu	bj[5]=f0
+	cmp4.le		p12,p13=7,in5	}
+{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
+	(p7)fcvt.fxu	bj[4]=f0
+	cmp4.le		p14,p15=8,in5	}
+{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
+	(p5)fcvt.fxu	ni2=f0
+	addp4		r28=-1,in5	}
+{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
+	(p7)fcvt.fxu	ni3=f0
+	$ADDP		in4=0,in4	};;
+
+{ .mfi;	ldf8		n0=[in4]
+	fcvt.fxu	tf[1]=f0
+	nop.i		0		}
+
+{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
+	(p9)fcvt.fxu	ai4=f0
+	mov		t[0]=r0		}
+{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
+	(p11)fcvt.fxu	ai5=f0
+	mov		t[1]=r0		}
+{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
+	(p9)fcvt.fxu	bj[3]=f0
+	mov		t[2]=r0		}
+{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
+	(p11)fcvt.fxu	bj[2]=f0
+	mov		t[3]=r0		}
+{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
+	(p9)fcvt.fxu	ni4=f0
+	mov		t[4]=r0		}
+{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
+	(p11)fcvt.fxu	ni5=f0
+	mov		t[5]=r0		};;
+
+{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
+	(p13)fcvt.fxu	ai6=f0
+	mov		t[6]=r0		}
+{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
+	(p15)fcvt.fxu	ai7=f0
+	mov		t[7]=r0		}
+{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
+	(p13)fcvt.fxu	bj[1]=f0
+	mov		ar.lc=r28	}
+{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
+	(p15)fcvt.fxu	bj[0]=f0
+	mov		ar.ec=1		}
+{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
+	(p13)fcvt.fxu	ni6=f0
+	mov		pr.rot=1<<16	}
+{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
+	(p15)fcvt.fxu	ni7=f0
+	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
+					};;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
+// to measure with help of Interval Time Counter indicated that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 0:
+	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
+	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
+{ .mfi;	(p42)	add		a3=a3,n3,1
+	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 4:
+	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
+	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
+{ .mfi;	(p43)	add		a4=a4,n4,1
+	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p16)	nop.m		0			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 8:
+	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
+	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
+{ .mfi;	(p42)	add		a5=a5,n5,1
+	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mfi;	(p16)	nop.m		0			// 10:
+	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
+	(p40)	cmp.ltu		p43,p41=a5,n5	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
+	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
+	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
+{ .mfi;	(p43)	add		a6=a6,n6,1
+	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mfi;	(p16)	nop.m		0			// 14:
+	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
+	(p41)	cmp.ltu		p42,p40=a6,n6	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
+	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	nop.m		0			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 16:
+	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
+	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
+{ .mfi;	(p42)	add		a7=a7,n7,1
+	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mfi;	(p16)	nop.m		0			// 18:
+	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
+	(p40)	cmp.ltu		p43,p41=a7,n7	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
+	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 20:
+	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
+	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
+{ .mfi;	(p43)	add		a8=a8,n8,1
+	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	};;
+{ .mfi;	(p16)	nop.m		0			// 22:
+	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
+	(p41)	cmp.ltu		p42,p40=a8,n8	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
+	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	};;
+{ .mfi;	(p16)	nop.m		0			// 24:
+	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
+	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
+{ .mfi;	(p16)	nop.m		0
+	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
+	(p17)	mov		t[0]=r0		};;
+{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
+	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
+	(p42)	add		t[0]=t[0],r0,1	};;
+{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
+	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
+	(p50)	add		t[0]=t[0],r0,1	}
+{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
+	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
+	(p16)	cmp.ltu.unc	p50,p48=t0,a1
+	(p16)	nop.i		0		};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 28:
+	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
+	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
+{ .mfi;	(p42)	add		a2=a2,n2,1
+	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
+	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
+	(p50)	add		t[6]=t[6],a2,1	};;
+{ .mfi;	(p16)	nop.m		0			// 30:
+	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
+	(p40)	cmp.ltu		p41,p39=a2,n2	}
+{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
+	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
+	(p16)	nop.f		0
+	(p48)	cmp.ltu		p49,p47=t[6],a2	}
+{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
+	(p16)	nop.f		0
+	br.ctop.sptk.many	.Louter_8_ctop	};;
+.Louter_8_cend:
+
+// above loop has to execute one more time, without (p16), which is
+// replaced with merged move of np[8] to GPR bank
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
+	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
+	(p42)	add		a3=a3,n3,1	};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
+	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
+	(p43)	add		a4=a4,n4,1	};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p0)	nop.f		0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
+	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
+	(p42)	add		a5=a5,n5,1	};;
+{ .mii;	(p0)	nop.m		0			// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mii;	(p0)	nop.m		0			// 10:
+	(p40)	cmp.ltu		p43,p41=a5,n5
+	(p42)	cmp.leu		p43,p41=a5,n5	};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
+	(p43)	add		a6=a6,n6,1	};;
+{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mii;	(p0)	nop.m		0			// 14:
+	(p41)	cmp.ltu		p42,p40=a6,n6
+	(p43)	cmp.leu		p42,p40=a6,n6	};;
+{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	nop.m		0			// 16:
+	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
+	(p42)	add		a7=a7,n7,1	};;
+{ .mii;	(p0)	nop.m		0			// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mii;	(p0)	nop.m		0			// 18:
+	(p40)	cmp.ltu		p43,p41=a7,n7
+	(p42)	cmp.leu		p43,p41=a7,n7	};;
+{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p0)	nop.m		0			// 20:
+	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
+	(p43)	add		a8=a8,n8,1	};;
+{ .mmi;	(p0)	nop.m		0			// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	}
+{ .mmi;	(p17)	mov		t[0]=r0
+	(p41)	cmp.ltu		p42,p40=a8,n8
+	(p43)	cmp.leu		p42,p40=a8,n8	};;
+{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	}
+{ .mmi;	(p42)	add		t[0]=t[0],r0,1
+	(p0)	add		r16=-7*16,prevsp
+	(p0)	add		r17=-6*16,prevsp	};;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+//
+// The borrow travels through predicates rather than a register: after
+// every word exactly one of a "no borrow" predicate (p32 or p33) and a
+// "borrow" predicate (p34 or p35) is set, and it selects between the
+// plain sub and the sub ...,1 form for the next, more significant word.
+// The last step subtracts the final borrow from the carry word t[0], so
+// on exit p34 means carrybit|tmp < np (the subtraction underflowed) and
+// p32 means it did not.
+{ .mmi;	(p50)add	t[0]=t[0],r0,1		// carry out of the last t[1]+=a8
+	add		r18=-5*16,prevsp	// f18 spill slot, for the epilogue
+	sub		n1=t0,n1	};;
+{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n2=t[7],n2
+	(p34)sub	n2=t[7],n2,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
+	(p34)cmp.geu	p35,p33=n2,t[7];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n3=t[6],n3	}
+{ .mmi;	(p35)sub	n3=t[6],n3,1;;
+	(p33)cmp.gtu	p34,p32=n3,t[6]
+	(p35)cmp.geu	p34,p32=n3,t[6]	};;
+	.pred.rel	"mutex",p32,p34
+{ .mii;	(p32)sub	n4=t[5],n4
+	(p34)sub	n4=t[5],n4,1;;
+	(p32)cmp.gtu	p35,p33=n4,t[5]	}
+{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n5=t[4],n5
+	(p35)sub	n5=t[4],n5,1	};;
+{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
+	(p35)cmp.geu	p34,p32=n5,t[4];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n6=t[3],n6	}
+{ .mmi;	(p34)sub	n6=t[3],n6,1;;
+	(p32)cmp.gtu	p35,p33=n6,t[3]
+	(p34)cmp.geu	p35,p33=n6,t[3]	};;
+	.pred.rel	"mutex",p33,p35
+{ .mii;	(p33)sub	n7=t[2],n7
+	(p35)sub	n7=t[2],n7,1;;
+	(p33)cmp.gtu	p34,p32=n7,t[2]	}
+{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n8=t[1],n8
+	(p34)sub	n8=t[1],n8,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
+	(p34)cmp.geu	p35,p33=n8,t[1];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	a8=t[0],r0	}		// carry word minus final borrow
+{ .mmi;	(p35)sub	a8=t[0],r0,1;;
+	(p33)cmp.gtu	p34,p32=a8,t[0]
+	(p35)cmp.geu	p34,p32=a8,t[0]	};;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+// p32 (no underflow) stores the difference n1..n8, p34 (underflow)
+// stores the unreduced t0,t[7]..t[1]. Only num words are written: p5,
+// p7, ... p15 are the "false" outcomes of the cmp4.le 3..8,in5 tests
+// made while the inputs were loaded, so each branch below leaves as
+// soon as the vector length has been reached.
+	.pred.rel	"mutex",p32,p34
+{ .mmi;	(p32)st8	[rptr]=n1,8
+	(p34)st8	[rptr]=t0,8
+	add		r19=-4*16,prevsp};;	// f19 spill slot, for the epilogue
+{ .mmb;	(p32)st8	[rptr]=n2,8
+	(p34)st8	[rptr]=t[7],8
+	(p5)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n3,8
+	(p34)st8	[rptr]=t[6],8
+	(p7)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n4,8
+	(p34)st8	[rptr]=t[5],8
+	(p9)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n5,8
+	(p34)st8	[rptr]=t[4],8
+	(p11)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n6,8
+	(p34)st8	[rptr]=t[3],8
+	(p13)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n7,8
+	(p34)st8	[rptr]=t[2],8
+	(p15)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n8,8
+	(p34)st8	[rptr]=t[1],8
+	nop.b		0		};;
+.Ldone:						// epilogue
+// refill f16-f23 from the prologue's spill slots; r16-r19 address the
+// f16..f19 slots and the post-increment by 64 moves each to f20..f23
+{ .mmi;	ldf.fill	f16=[r16],64
+	ldf.fill	f17=[r17],64
+	nop.i		0		}
+{ .mmi;	ldf.fill	f18=[r18],64
+	ldf.fill	f19=[r19],64
+	mov		pr=prevpr,0x1ffff	};;
+{ .mmi;	ldf.fill	f20=[r16]
+	ldf.fill	f21=[r17]
+	mov		ar.lc=prevlc	}
+{ .mmi;	ldf.fill	f22=[r18]
+	ldf.fill	f23=[r19]
+	mov		ret0=1		}	// signal "handled"
+{ .mib;	rum		1<<5			// clear um.mfh, as bn_mul_mont_general does
+	.restore	sp
+	mov		sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_8#
+
+.type	copyright#,\@object
+copyright:
+stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Emit the generated assembly. An optional first argument names the
+# output file; without it the code goes to the inherited STDOUT.
+# Die if the file cannot be created or if the buffered output cannot be
+# flushed, so that a missing or truncated assembly file is never
+# mistaken for a successful run.
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/ia64.S
+++ b/openssl-1.0.2f/crypto/bn/asm/ia64.S
--- a/openssl-1.0.2f/crypto/bn/asm/mips-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/mips-mont.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
+# code.
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+# Each name becomes a literal dollar sign followed by the hardware
+# register number it stands for under the NUBI layout.
+($zero,$at,$t0,$t1,$t2) = map { sprintf('$%d',$_) } (0,1,2,24,25);
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { sprintf('$%d',$_) } (4..11);
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) =
+	map { sprintf('$%d',$_) } (12..23);
+($gp,$tp,$sp,$fp,$ra) = map { sprintf('$%d',$_) } (3,28,29,30,31);
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+# Per-ABI mnemonics for pointer arithmetic and for saving/restoring
+# registers, plus the register size in bytes.
+#
+# Pointer arithmetic deliberately uses the unsigned (non-trapping)
+# add/subtract forms: the signed add/sub/dadd/dsub raise an integer
+# overflow exception when the result crosses the signed boundary, which
+# is never what is wanted for an address or for the stack pointer.
+if ($flavour =~ /64|n32/i) {
+	$PTR_ADD="daddu";	# incidentally works even on n32
+	$PTR_SUB="dsubu";	# incidentally works even on n32
+	$REG_S="sd";
+	$REG_L="ld";
+	$SZREG=8;
+} else {
+	$PTR_ADD="addu";
+	$PTR_SUB="subu";
+	$REG_S="sw";
+	$REG_L="lw";
+	$SZREG=4;
+}
+# Bit mask of saved registers for the .mask directive: registers 12-23
+# for the nubi flavours, 16-23 otherwise.
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# Skip leading arguments until one looks like a file name (a word
+# character, then word characters or dashes, a dot and an extension);
+# that argument becomes the output file. If none is found the code is
+# written to the inherited STDOUT, so only attempt the open when there
+# is a name, and die instead of silently carrying on when it fails.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+if ($output) {
+	open STDOUT,">$output" or die "can't open $output: $!";
+}
+
+# Mnemonics that operate on one BN_ULONG (load, store, unsigned
+# multiply, add, subtract) and the BN_ULONG size in bytes, per ABI.
+($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) =
+	($flavour =~ /64|n32/i)
+	? ("ld","sd","dmultu","daddu","dsubu",8)	# 64-bit words
+	: ("lw","sw","multu","addu","subu",4);		# 32-bit words
+
+# Argument registers, laid out after the C prototype:
+# int bn_mul_mont(
+$rp=$a0;	# BN_ULONG *rp,
+$ap=$a1;	# const BN_ULONG *ap,
+$bp=$a2;	# const BN_ULONG *bp,
+$np=$a3;	# const BN_ULONG *np,
+$n0=$a4;	# const BN_ULONG *n0,
+$num=$a5;	# int num);
+
+# Working registers. Roles are as used by the code template below.
+$lo0=$a6;	# low word of the ap[]*bp[i] running sum
+$hi0=$a7;	# its carry (high) word
+$lo1=$t1;	# low word of the np[]*m1 running sum
+$hi1=$t2;	# its carry (high) word
+$aj=$s0;	# &ap[j], then ap[j] itself
+$bi=$s1;	# &bp[i], then bp[i] itself
+$nj=$s2;	# &np[j], then np[j] itself
+$tp=$s3;	# cursor into the temporary vector on the stack
+		# (replaces the NUBI thread-pointer name set earlier)
+$alo=$s4;	# low half of the latest ap[j]*bp[i] product
+$ahi=$s5;	# high half of the same product
+$nlo=$s6;	# low half of the latest np[j]*m1 product
+$nhi=$s7;	# high half of the same product
+$tj=$s8;	# tp[j]; later reused as the end pointer &tp[num]
+$i=$s9;		# byte offset of the current word in bp[]
+$j=$s10;	# byte offset of the current word in ap[] and np[]
+$m1=$s11;	# low word of lo0*n0, the per-row Montgomery multiplier
+
+# Size of the register save area, counted in $SZREG-sized slots.
+$FRAMESIZE=14;
+
+# Public entry point. It declines the job (returns 0) unless
+# 4 <= num <= 16 and otherwise branches to bn_mul_mont_internal.
+# For the o32 flavour the fifth and sixth arguments are first fetched
+# from the caller's stack into the registers used for n0 and num.
+$code=<<___;
+.text
+
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	bn_mul_mont
+.ent	bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+	lw	$n0,16($sp)
+	lw	$num,20($sp)
+___
+$code.=<<___;
+	slt	$at,$num,4	# at = (num < 4)
+	bnez	$at,1f		# too short: decline
+	li	$t0,0		# delay slot: return value 0
+	slt	$at,$num,17	# on in-order CPU
+	bnez	$at,bn_mul_mont_internal	# 4 <= num < 17: do the work
+	nop
+1:	jr	$ra
+	li	$a0,0		# delay slot: return value 0 here as well
+.end	bn_mul_mont
+
+.align	5
+.ent	bn_mul_mont_internal
+bn_mul_mont_internal:
+	.frame	$fp,$FRAMESIZE*$SZREG,$ra
+	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
+	$PTR_SUB $sp,$FRAMESIZE*$SZREG	# allocate the register save area
+	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+# $s0-$s3 are saved (and later restored) only for the nubi flavours;
+# this matches the wider $SAVED_REGS_MASK selected for nubi.
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
+	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+	move	$fp,$sp		# fp keeps addressing the save area
+
+	.set	reorder
+	$LD	$n0,0($n0)	# fetch the n0 word through its pointer
+	$LD	$bi,0($bp)	# bp[0]
+	$LD	$aj,0($ap)	# ap[0]
+	$LD	$nj,0($np)	# np[0]
+
+	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
+	sll	$num,`log($BNSZ)/log(2)`	# num is a byte count from here on
+	li	$at,-4096
+	$PTR_SUB $sp,$num	# room for num words of tp[]
+	and	$sp,$at		# round sp down to a 4096-byte boundary
+
+	$MULTU	$aj,$bi
+	$LD	$alo,$BNSZ($ap)
+	$LD	$nlo,$BNSZ($np)
+	mflo	$lo0
+	mfhi	$hi0
+	$MULTU	$lo0,$n0
+	mflo	$m1
+
+	$MULTU	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	$MULTU	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+	$MULTU	$nlo,$m1
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,$sp
+	li	$j,2*$BNSZ
+.align	4
+.L1st:
+	.set	noreorder
+	$PTR_ADD $aj,$ap,$j
+	$PTR_ADD $nj,$np,$j
+	$LD	$aj,($aj)
+	$LD	$nj,($nj)
+
+	$MULTU	$aj,$bi
+	$ADDU	$lo0,$alo,$hi0
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo0,$hi0
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$hi1,$nhi,$t0
+	mflo	$alo
+	mfhi	$ahi
+
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$MULTU	$nj,$m1
+	$ADDU	$hi1,$at
+	addu	$j,$BNSZ
+	$ST	$lo1,($tp)
+	sltu	$t0,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+
+	bnez	$t0,.L1st
+	$PTR_ADD $tp,$BNSZ
+	.set	reorder
+
+	$ADDU	$lo0,$alo,$hi0
+	sltu	$at,$lo0,$hi0
+	$ADDU	$hi0,$ahi,$at
+
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi1,$nhi,$t0
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+
+	$ST	$lo1,($tp)
+
+	$ADDU	$hi1,$hi0
+	sltu	$at,$hi1,$hi0
+	$ST	$hi1,$BNSZ($tp)
+	$ST	$at,2*$BNSZ($tp)
+
+	li	$i,$BNSZ
+.align	4
+.Louter:
+	$PTR_ADD $bi,$bp,$i
+	$LD	$bi,($bi)
+	$LD	$aj,($ap)
+	$LD	$alo,$BNSZ($ap)
+	$LD	$tj,($sp)
+
+	$MULTU	$aj,$bi
+	$LD	$nj,($np)
+	$LD	$nlo,$BNSZ($np)
+	mflo	$lo0
+	mfhi	$hi0
+	$ADDU	$lo0,$tj
+	$MULTU	$lo0,$n0
+	sltu	$at,$lo0,$tj
+	$ADDU	$hi0,$at
+	mflo	$m1
+
+	$MULTU	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	$MULTU	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+
+	$MULTU	$nlo,$m1
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,$sp
+	li	$j,2*$BNSZ
+	$LD	$tj,$BNSZ($tp)
+.align	4
+.Linner:
+	.set	noreorder
+	$PTR_ADD $aj,$ap,$j
+	$PTR_ADD $nj,$np,$j
+	$LD	$aj,($aj)
+	$LD	$nj,($nj)
+
+	$MULTU	$aj,$bi
+	$ADDU	$lo0,$alo,$hi0
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo0,$hi0
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$hi1,$nhi,$t0
+	mflo	$alo
+	mfhi	$ahi
+
+	$ADDU	$lo0,$tj
+	addu	$j,$BNSZ
+	$MULTU	$nj,$m1
+	sltu	$at,$lo0,$tj
+	$ADDU	$lo1,$lo0
+	$ADDU	$hi0,$at
+	sltu	$t0,$lo1,$lo0
+	$LD	$tj,2*$BNSZ($tp)
+	$ADDU	$hi1,$t0
+	sltu	$at,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+	$ST	$lo1,($tp)
+	bnez	$at,.Linner
+	$PTR_ADD $tp,$BNSZ
+	.set	reorder
+
+	$ADDU	$lo0,$alo,$hi0
+	sltu	$at,$lo0,$hi0
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$lo0,$tj
+	sltu	$t0,$lo0,$tj
+	$ADDU	$hi0,$t0
+
+	$LD	$tj,2*$BNSZ($tp)
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo1,$hi1
+	$ADDU	$hi1,$nhi,$at
+	$ADDU	$lo1,$lo0
+	sltu	$t0,$lo1,$lo0
+	$ADDU	$hi1,$t0
+	$ST	$lo1,($tp)
+
+	$ADDU	$lo1,$hi1,$hi0
+	sltu	$hi1,$lo1,$hi0
+	$ADDU	$lo1,$tj
+	sltu	$at,$lo1,$tj
+	$ADDU	$hi1,$at
+	$ST	$lo1,$BNSZ($tp)
+	$ST	$hi1,2*$BNSZ($tp)
+
+	addu	$i,$BNSZ
+	sltu	$t0,$i,$num
+	bnez	$t0,.Louter
+
+	.set	noreorder
+	$PTR_ADD $tj,$sp,$num	# &tp[num]
+	move	$tp,$sp
+	move	$ap,$sp
+	li	$hi0,0		# clear borrow bit
+
+.align	4
+.Lsub:	$LD	$lo0,($tp)
+	$LD	$lo1,($np)
+	$PTR_ADD $tp,$BNSZ
+	$PTR_ADD $np,$BNSZ
+	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
+	sgtu	$at,$lo1,$lo0	# borrow out of tp[i]-np[i]
+	$SUBU	$lo0,$lo1,$hi0	# minus the incoming borrow
+	sgtu	$hi0,$lo0,$lo1	# borrow out of that second subtraction
+	$ST	$lo0,($rp)	# rp[i] = tp[i]-np[i]-borrow
+	or	$hi0,$at	# outgoing borrow = either of the two
+	sltu	$at,$tp,$tj	# more words below &tp[num]?
+	bnez	$at,.Lsub
+	$PTR_ADD $rp,$BNSZ	# delay slot: runs on every pass
+
+	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
+	move	$tp,$sp
+	$PTR_SUB $rp,$num	# restore rp
+	not	$hi1,$hi0	# complementary mask; NOTE(review): the select below presumes hi0 is 0 or all ones - confirm
+
+	and	$ap,$hi0,$sp
+	and	$bp,$hi1,$rp
+	or	$ap,$ap,$bp	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	$LD	$aj,($ap)
+	$PTR_ADD $ap,$BNSZ
+	$ST	$zero,($tp)	# wipe the temporary word
+	$PTR_ADD $tp,$BNSZ
+	sltu	$at,$tp,$tj	# more words below &tp[num]?
+	$ST	$aj,($rp)	# rp[i] = selected source word
+	bnez	$at,.Lcopy
+	$PTR_ADD $rp,$BNSZ	# delay slot: runs on every pass
+
+	li	$a0,1		# return value 1, placed in both
+	li	$t0,1		# registers the entry stub zeroes
+
+	.set	noreorder
+	move	$sp,$fp		# drop tp[], back to the save area
+	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+# $s0-$s3 were saved only for the nubi flavours, so they are restored
+# only for those as well.
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
+	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE*$SZREG	# delay slot: release the save area
+.end	bn_mul_mont_internal
+.rdata
+.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Replace every backtick-quoted expression in the template (such as the
+# shift count computed from $BNSZ) with the result of evaluating it.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+# Emit the assembly and die if the buffered output cannot be flushed,
+# so that a truncated file is never mistaken for a successful run.
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/mips.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/mips.pl
--- a/openssl-1.0.2f/crypto/bn/asm/mips3-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/mips3-mont.pl
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys. While 512-bit
+# RSA private key operations are 40% faster, 1024-bit ones are hardly
+# faster at all, while longer key operations are slower by up to 20%.
+# It might be of interest to embedded system developers though, as
+# it's smaller than 1KB, yet offers ~3x improvement over compiler
+# generated code.
+#
+# The module targets N32 and N64 MIPS ABIs and currently is a bit
+# IRIX-centric, i.e. is likely to require adaptation for other OSes.
+
+# Argument registers, laid out after the C prototype. Names are the
+# symbolic ones from <regdef.h>, which the template includes.
+# int bn_mul_mont(
+$rp="a0";	# BN_ULONG *rp,
+$ap="a1";	# const BN_ULONG *ap,
+$bp="a2";	# const BN_ULONG *bp,
+$np="a3";	# const BN_ULONG *np,
+$n0="a4";	# const BN_ULONG *n0,
+$num="a5";	# int num);
+
+# Working registers. Roles are as used by the code template below;
+# s7 is additionally used there, by name, as a scratch flag register.
+$lo0="a6";	# low word of the ap[]*bp[i] running sum
+$hi0="a7";	# its carry (high) word
+$lo1="v0";	# low word of the np[]*m1 running sum
+$hi1="v1";	# its carry (high) word
+$aj="t0";	# &ap[j], then ap[j] itself
+$bi="t1";	# &bp[i], then bp[i] itself
+$nj="t2";	# &np[j], then np[j] itself
+$tp="t3";	# cursor into the temporary vector on the stack
+$alo="s0";	# low half of the latest ap[j]*bp[i] product
+$ahi="s1";	# high half of the same product
+$nlo="s2";	# low half of the latest np[j]*m1 product
+$nhi="s3";	# high half of the same product
+$tj="s4";	# tp[j]; later reused as the end pointer &tp[num]
+$i="s5";	# byte offset of the current word in bp[]
+$j="s6";	# byte offset of the current word in ap[] and np[]
+$fp="t8";	# base of the 64-byte save area for s0-s7
+$m1="t9";	# low word of lo0*n0, the per-row Montgomery multiplier
+
+# NOTE(review): $FRAME is not referenced by the template visible below,
+# which hard-codes a 64-byte save area instead - confirm it is unused.
+$FRAME=8*(2+8);
+
+# Entry and frame setup. The routine returns 0 in v0 when num < 4;
+# otherwise it reserves num+2 words for the temporary vector below a
+# 64-byte save area and stores s0-s7 there.
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set	noat
+.set	reorder
+
+.align	5
+.globl	bn_mul_mont
+.ent	bn_mul_mont
+bn_mul_mont:
+	.set	noreorder
+	PTR_SUB	sp,64		# 64-byte save area for s0-s7
+	move	$fp,sp
+	.frame	$fp,64,ra
+	slt	AT,$num,4	# AT = (num < 4)
+	li	v0,0		# preset return value 0
+	beqzl	AT,.Lproceed	# num >= 4: go on
+	nop
+	jr	ra		# num < 4: decline
+	PTR_ADD	sp,$fp,64	# delay slot: release the save area
+	.set	reorder
+.align	5
+.Lproceed:
+	ld	$n0,0($n0)	# fetch the n0 word through its pointer
+	ld	$bi,0($bp)	# bp[0]
+	ld	$aj,0($ap)	# ap[0]
+	ld	$nj,0($np)	# np[0]
+	PTR_SUB	sp,16		# place for two extra words
+	sll	$num,3		# num is a byte count from here on
+	li	AT,-4096
+	PTR_SUB	sp,$num		# room for num words of tp[]
+	and	sp,AT		# round sp down to a 4096-byte boundary
+
+	sd	s0,0($fp)	# save s0-s7 in the area reserved at entry
+	sd	s1,8($fp)
+	sd	s2,16($fp)
+	sd	s3,24($fp)
+	sd	s4,32($fp)
+	sd	s5,40($fp)
+	sd	s6,48($fp)
+	sd	s7,56($fp)
+
+	dmultu	$aj,$bi
+	ld	$alo,8($ap)
+	ld	$nlo,8($np)
+	mflo	$lo0
+	mfhi	$hi0
+	dmultu	$lo0,$n0
+	mflo	$m1
+
+	dmultu	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	dmultu	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+	dmultu	$nlo,$m1
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,sp
+	li	$j,16
+.align	4
+.L1st:
+	.set	noreorder
+	PTR_ADD	$aj,$ap,$j
+	ld	$aj,($aj)
+	PTR_ADD	$nj,$np,$j
+	ld	$nj,($nj)
+
+	dmultu	$aj,$bi
+	daddu	$lo0,$alo,$hi0
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo0,$hi0
+	sltu	s7,$lo1,$hi1
+	daddu	$hi0,$ahi,AT
+	daddu	$hi1,$nhi,s7
+	mflo	$alo
+	mfhi	$ahi
+
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	dmultu	$nj,$m1
+	daddu	$hi1,AT
+	addu	$j,8
+	sd	$lo1,($tp)
+	sltu	s7,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+
+	bnez	s7,.L1st
+	PTR_ADD	$tp,8
+	.set	reorder
+
+	daddu	$lo0,$alo,$hi0
+	sltu	AT,$lo0,$hi0
+	daddu	$hi0,$ahi,AT
+
+	daddu	$lo1,$nlo,$hi1
+	sltu	s7,$lo1,$hi1
+	daddu	$hi1,$nhi,s7
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+
+	sd	$lo1,($tp)
+
+	daddu	$hi1,$hi0
+	sltu	AT,$hi1,$hi0
+	sd	$hi1,8($tp)
+	sd	AT,16($tp)
+
+	li	$i,8
+.align	4
+.Louter:
+	PTR_ADD	$bi,$bp,$i
+	ld	$bi,($bi)
+	ld	$aj,($ap)
+	ld	$alo,8($ap)
+	ld	$tj,(sp)
+
+	dmultu	$aj,$bi
+	ld	$nj,($np)
+	ld	$nlo,8($np)
+	mflo	$lo0
+	mfhi	$hi0
+	daddu	$lo0,$tj
+	dmultu	$lo0,$n0
+	sltu	AT,$lo0,$tj
+	daddu	$hi0,AT
+	mflo	$m1
+
+	dmultu	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	dmultu	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+
+	dmultu	$nlo,$m1
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,sp
+	li	$j,16
+	ld	$tj,8($tp)
+.align	4
+.Linner:
+	.set	noreorder
+	PTR_ADD	$aj,$ap,$j
+	ld	$aj,($aj)
+	PTR_ADD	$nj,$np,$j
+	ld	$nj,($nj)
+
+	dmultu	$aj,$bi
+	daddu	$lo0,$alo,$hi0
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo0,$hi0
+	sltu	s7,$lo1,$hi1
+	daddu	$hi0,$ahi,AT
+	daddu	$hi1,$nhi,s7
+	mflo	$alo
+	mfhi	$ahi
+
+	daddu	$lo0,$tj
+	addu	$j,8
+	dmultu	$nj,$m1
+	sltu	AT,$lo0,$tj
+	daddu	$lo1,$lo0
+	daddu	$hi0,AT
+	sltu	s7,$lo1,$lo0
+	ld	$tj,16($tp)
+	daddu	$hi1,s7
+	sltu	AT,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+	sd	$lo1,($tp)
+	bnez	AT,.Linner
+	PTR_ADD	$tp,8
+	.set	reorder
+
+	daddu	$lo0,$alo,$hi0
+	sltu	AT,$lo0,$hi0
+	daddu	$hi0,$ahi,AT
+	daddu	$lo0,$tj
+	sltu	s7,$lo0,$tj
+	daddu	$hi0,s7
+
+	ld	$tj,16($tp)
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo1,$hi1
+	daddu	$hi1,$nhi,AT
+	daddu	$lo1,$lo0
+	sltu	s7,$lo1,$lo0
+	daddu	$hi1,s7
+	sd	$lo1,($tp)
+
+	daddu	$lo1,$hi1,$hi0
+	sltu	$hi1,$lo1,$hi0
+	daddu	$lo1,$tj
+	sltu	AT,$lo1,$tj
+	daddu	$hi1,AT
+	sd	$lo1,8($tp)
+	sd	$hi1,16($tp)
+
+	addu	$i,8
+	sltu	s7,$i,$num
+	bnez	s7,.Louter
+
+	.set	noreorder
+	PTR_ADD	$tj,sp,$num	# &tp[num]
+	move	$tp,sp
+	move	$ap,sp
+	li	$hi0,0		# clear borrow bit
+
+.align	4
+.Lsub:	ld	$lo0,($tp)
+	ld	$lo1,($np)
+	PTR_ADD	$tp,8
+	PTR_ADD	$np,8
+	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
+	sgtu	AT,$lo1,$lo0	# borrow out of tp[i]-np[i]
+	dsubu	$lo0,$lo1,$hi0	# minus the incoming borrow
+	sgtu	$hi0,$lo0,$lo1	# borrow out of that second subtraction
+	sd	$lo0,($rp)	# rp[i] = tp[i]-np[i]-borrow
+	or	$hi0,AT		# outgoing borrow = either of the two
+	sltu	AT,$tp,$tj	# more words below &tp[num]?
+	bnez	AT,.Lsub
+	PTR_ADD	$rp,8		# delay slot: runs on every pass
+
+	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
+	move	$tp,sp
+	PTR_SUB	$rp,$num	# restore rp
+	not	$hi1,$hi0	# complementary mask; NOTE(review): the select below presumes hi0 is 0 or all ones - confirm
+
+	and	$ap,$hi0,sp
+	and	$bp,$hi1,$rp
+	or	$ap,$ap,$bp	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	ld	$aj,($ap)
+	PTR_ADD	$ap,8
+	PTR_ADD	$tp,8
+	sd	zero,-8($tp)	# wipe the temporary word just passed
+	sltu	AT,$tp,$tj	# more words below &tp[num]?
+	sd	$aj,($rp)	# rp[i] = selected source word
+	bnez	AT,.Lcopy
+	PTR_ADD	$rp,8		# delay slot: runs on every pass
+
+	ld	s0,0($fp)	# restore s0-s7 from the save area
+	ld	s1,8($fp)
+	ld	s2,16($fp)
+	ld	s3,24($fp)
+	ld	s4,32($fp)
+	ld	s5,40($fp)
+	ld	s6,48($fp)
+	ld	s7,56($fp)
+	li	v0,1		# return value 1
+	jr	ra
+	PTR_ADD	sp,$fp,64	# delay slot: release the whole frame
+	.set	reorder
+END(bn_mul_mont)
+.rdata
+.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Emit the assembly on STDOUT and die if the buffered output cannot be
+# flushed, so that a truncated file is never mistaken for success.
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/mips3.s
+++ b/openssl-1.0.2f/crypto/bn/asm/mips3.s
--- a/openssl-1.0.2f/crypto/bn/asm/pa-risc2.s
+++ b/openssl-1.0.2f/crypto/bn/asm/pa-risc2.s
--- a/openssl-1.0.2f/crypto/bn/asm/pa-risc2W.s
+++ b/openssl-1.0.2f/crypto/bn/asm/pa-risc2W.s
--- a/openssl-1.0.2f/crypto/bn/asm/parisc-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,995 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplications [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths have to be even and vector
+# addresses have to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+# Directory part of the script path, including the trailing separator.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+# Command line: <flavour> <output file>.
+$flavour = shift;
+$output = shift;
+
+# All generated text goes through STDOUT; stop right away if the output
+# file cannot be opened rather than running on and producing nothing.
+open STDOUT,">$output" or die "can't open $output: $!";
+
+if ($flavour =~ /64/) {
+	# 64-bit ABI: PA-RISC 2.0W, doubleword pushes and pops.
+	($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP) = ("2.0W",8,80,16);
+	($PUSH,$PUSHMA,$POP,$POPMB) = ("std","std,ma","ldd","ldd,mb");
+	$BN_SZ = $SIZE_T;
+} else {
+	# 32-bit ABI: word pushes and pops, PA-RISC 1.1 unless overridden below.
+	($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP) = ("1.1",4,48,20);
+	($PUSH,$PUSHMA,$POP,$POPMB) = ("stw","stwm","ldw","ldwm");
+	$BN_SZ = $SIZE_T;
+	# A SIXTY_FOUR_BIT define in opensslconf.h switches to 8-byte BN words
+	# and .LEVEL 2.0; if the header cannot be opened the defaults stay.
+	if (open CONF,"<${dir}../../opensslconf.h") {
+	    while (<CONF>) {
+		next unless (m/#\s*define\s+SIXTY_FOUR_BIT/);
+		($BN_SZ,$LEVEL) = (8,"2.0");
+		last;
+	    }
+	    close CONF;
+	}
+}
+
+# Stack frame: 8 saved registers, then 32 bytes of local variables, then
+# the frame marker [+ argument transfer]. $LOCALS is the offset of the
+# local variables from the frame pointer.
+$LOCALS=8*$SIZE_T;
+$FRAME=$LOCALS+$FRAME_MARKER+32;
+
+# Scratch registers.
+($tp,$ti1,$ti0)=("%r31","%r29","%r28");
+
+# Arguments; $n0 and $num are passed through stack in 32-bit.
+($rp,$ap,$bp,$np)=("%r26","%r25","%r24","%r23");
+($n0,$num)=("%r22","%r21");
+($idx,$arrsz)=("%r20","%r19");
+
+($nm1,$nm0,$ab1,$ab0)=("%r7","%r6","%r5","%r4");
+
+($fp,$hi1,$hi0)=("%r3","%r2","%r1");
+
+$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
+
+# Floating-point registers; $fti shares a register with $fm0.
+$fti=$fm0="%fr4";
+($fbi,$fn0)=("%fr5L","%fr5R");
+($fai,$fab0,$fab1)=("%fr6","%fr7","%fr8");
+($fni,$fnm0,$fnm1)=("%fr9","%fr10","%fr11");
+
+$code=<<___;	# assembler header, bn_mul_mont entry point and register-saving prologue
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+bn_mul_mont
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	ldo	-$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);	# 32-bit ABI only: n0 and num are loaded from the caller's frame
+	ldw	`-$FRAME_MARKER-4`($fp),$n0
+	ldw	`-$FRAME_MARKER-8`($fp),$num
+	nop
+	nop					; alignment
+___
+$code.=<<___ if ($BN_SZ==4);	# 32-bit BN words: bail out to L$abort on short, odd-length or misaligned input
+	comiclr,<=	6,$num,%r0		; are vectors long enough?
+	b		L\$abort
+	ldi		0,%r28			; signal "unhandled"
+	add,ev		%r0,$num,$num		; is $num even?
+	b		L\$abort
+	nop
+	or		$ap,$np,$ti1
+	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
+	b		L\$abort
+	nop
+	nop					; alignment
+	nop
+
+	fldws		0($n0),${fn0}
+	fldws,ma	4($bp),${fbi}		; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);	# 64-bit BN words: length check, then num is doubled to count 32-bit halves
+	comib,>		3,$num,L\$abort		; are vectors long enough?
+	ldi		0,%r28			; signal "unhandled"
+	addl		$num,$num,$num		; I operate on 32-bit values
+
+	fldws		4($n0),${fn0}		; only low part of n0
+	fldws		4($bp),${fbi}		; bp[0] in flipped word order
+___
+$code.=<<___;	# both paths: reserve tp[num+1] on the stack and start the first ap*bp[0] / np*m products
+	fldds		0($ap),${fai}		; ap[0,1]
+	fldds		0($np),${fni}		; np[0,1]
+
+	sh2addl		$num,%r0,$arrsz
+	ldi		31,$hi0
+	ldo		36($arrsz),$hi1		; space for tp[num+1]
+	andcm		$hi1,$hi0,$hi1		; align
+	addl		$hi1,%sp,%sp
+	$PUSH		$fp,-$SIZE_T(%sp)
+
+	ldo		`$LOCALS+16`($fp),$xfer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
+	xmpyu		${fn0},${fab0}R,${fm0}
+
+	addl		$arrsz,$ap,$ap		; point at the end
+	addl		$arrsz,$np,$np
+	subi		0,$arrsz,$idx		; j=0
+	ldo		8($idx),$idx		; j++++
+
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	fstds		${fab1},0($xfer)
+	fstds		${fnm1},8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);	# 32-bit BN words only: run-time choice between the 2.0 path below and L$parisc11
+	mtctl		$hi0,%cr11		; $hi0 still holds 31
+	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
+	b		L\$parisc11
+	nop
+___
+$code.=<<___;					# PA-RISC 2.0 code-path
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+
+	extrd,u		$ab0,31,32,$hi0
+	extrd,u		$ab0,63,32,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 ldo		8($idx),$idx		; j++++
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	 extrd,u	$nm0,31,32,$hi1
+
+L\$1st
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$ab1,$nm1,$nm1
+	 extrd,u	$nm1,31,32,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	 addl		$ab1,$nm1,$nm1
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);	# 32-bit BN words: load bp[1] and post-increment bp by 4
+	fldws,ma	4($bp),${fbi}		; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);	# 64-bit BN words: bp[1] is the word at offset 0 of the same doubleword
+	fldws		0($bp),${fbi}		; bp[1] in flipped word order
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	addl		$hi1,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2]
+	 flddx		$idx($np),${fni}	; np[2]
+	 ldo		8($idx),$idx		; j++++
+	ldd		-16($xfer),$ab0		; 33-bit value
+	ldd		-8($xfer),$nm0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	 extrd,u	$ab0,31,32,$ti0		; carry bit
+	 extrd,u	$ab0,63,32,$ab0
+	fstds		${fab1},0($xfer)
+	 addl		$ti0,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 extrd,u	$nm0,31,32,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+L\$inner
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldw		4($tp),$ti0		; tp[j]
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ti0,$ti0
+	 addl		$ti0,$ab0,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,31,32,$hi0
+	 extrd,u	$nm1,31,32,$hi1
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	ldw		4($tp),$ti0		; tp[j]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	addl		$hi0,$ab0,$ab0
+	 addl		$ti0,$ab0,$ab0
+	 stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,31,32,$hi0
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone	; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);	# 32-bit BN words: load next bp word and post-increment bp by 4
+	fldws,ma	4($bp),${fbi}		; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);	# 64-bit BN words: bp moves by +12 or -4 (see addl,ev on num) before the load
+	ldi		12,$ti0			; bp[i] in flipped word order
+	addl,ev		%r0,$num,$num
+	ldi		-4,$ti0
+	addl		$ti0,$bp,$bp
+	fldws		0($bp),${fbi}
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0]
+	addl		$hi0,$ab1,$ab1
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	addl		$hi1,$hi0,$hi0
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+	addl		$hi0,$ab1,$ab1
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	addl		$hi1,$hi0,$hi0
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);	# 32-bit BN words: rp=tp-np with borrow; rp that is not 64-bit aligned goes to L$sub_pa11
+	ldws,ma		4($tp),$ti0
+	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
+	b		L\$sub_pa11
+	addl		$tp,$arrsz,$tp
+L\$sub
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);	# 64-bit BN words: rp=tp-np, storing each tp doubleword back in flipped word order
+	ldd,ma		8($tp),$ti0
+L\$sub
+	ldd		$idx($np),$hi0
+	shrpd		$ti0,$ti0,32,$ti0	; flip word order
+	std		$ti0,-8($tp)		; save flipped value
+	sub,db		$ti0,$hi0,$hi1
+	ldd,ma		8($tp),$ti0
+	addib,<>	8,$idx,L\$sub
+	std,ma		$hi1,8($rp)
+
+	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
+	sub,db		$ti0,%r0,$hi1
+	ldo		-8($tp),$tp
+___
+$code.=<<___;	# pick tp or rp as copy source via the $hi1 mask, then copy to rp while zeroing tp
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy
+	ldd		$idx($np),$hi0
+	std,ma		%r0,8($tp)
+	addib,<>	8,$idx,.-8		; L\$copy
+	std,ma		$hi0,8($rp)	
+___
+
+if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+	b		L\$done
+	nop
+
+	.ALIGN		8
+L\$parisc11
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-12($xfer),$ablo
+	ldw		-16($xfer),$hi0
+	ldw		-4($xfer),$nmlo0
+	ldw		-8($xfer),$nmhi0
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+	 ldo		8($idx),$idx		; j++++
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	 addc		%r0,$nmhi0,$hi1
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+	nop
+
+L\$1st_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		12($xfer),$nmlo1
+	 addc		%r0,$abhi,$hi0
+	ldw		8($xfer),$nmhi1
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	fstds		${fnm1},8($xfer)
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-16($xfer),$abhi
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	ldw		-4($xfer),$nmlo0
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-8($xfer),$nmhi0
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab0},-16($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	fstds		${fnm0},-8($xfer)
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	 ldw		8($xfer),$nmhi1
+	 ldw		12($xfer),$nmlo1
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	 add		$hi0,$ablo,$ablo
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-4($xfer),$nmlo0
+
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	stws,ma		$nmlo0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[1]
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+	ldw		-16($xfer),$abhi	; carry bit actually
+	 ldo		8($idx),$idx		; j++++
+	ldw		-12($xfer),$ablo
+	ldw		-8($xfer),$nmhi0
+	ldw		-4($xfer),$nmlo0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	fstds		${fab1},0($xfer)
+	 addl		$abhi,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 addc		%r0,$nmhi0,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+
+L\$inner_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	ldw		12($xfer),$nmlo1
+	 add		$ti1,$ablo,$ablo
+	ldw		8($xfer),$nmhi1
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab1},0($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	ldw		8($tp),$ti1		; tp[j]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-4($xfer),$nmlo0
+	 add		$hi0,$ablo,$ablo
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$abhi,$abhi
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 add		$ti0,$ablo,$ablo
+	fstds		${fab0},-16($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm0},-8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	ldw		12($xfer),$nmlo1
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldw		8($xfer),$nmhi1
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	fstds		${fab1},0($xfer)
+	 add		$ti1,$ablo,$ablo
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$abhi,$hi0
+	ldw		-16($xfer),$abhi
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-8($xfer),$nmhi0
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-4($xfer),$nmlo0
+	 addc		%r0,$nmhi1,$hi1
+
+	add		$hi0,$ablo,$ablo
+	 stw		$nmlo1,-4($tp)		; tp[j-1]
+	addc		%r0,$abhi,$abhi
+	 add		$ti0,$ablo,$ablo
+	ldw		8($tp),$ti1		; tp[j]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone_pa11; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[i]
+	 flddx		$idx($ap),${fai}	; ap[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer_pa11
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32+4`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+	ldw		-4($tp),$ti0
+	addl		$tp,$arrsz,$tp
+L\$sub_pa11
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub_pa11
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy_pa11
+	ldwx		$idx($np),$hi0
+	stws,ma		%r0,4($tp)
+	addib,<>	4,$idx,L\$copy_pa11
+	stws,ma		$hi0,4($rp)	
+
+	nop					; alignment
+L\$done
+___
+}
+
+$code.=<<___;	# common epilogue: set the "handled" result, restore saved registers and return
+	ldi		1,%r28			; signal "handled"
+	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+# Encoder for "ldd": takes the completer string (e.g. "" or ",mb") and the
+# operand string, and returns the line to print. The two operand shapes
+# recognised below become a raw ".WORD" with the original instruction as
+# a trailing comment; anything else is returned as plain text.
+my $ldd = sub {
+  my ($completer,$operands) = @_;
+  my $text = "ldd$completer\t$operands";
+  my $word;
+
+    if ($operands =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {		# format 4
+	my ($index,$base,$target) = ($1,$2,$3);
+	$word  = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
+    }
+    elsif ($operands =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 5
+	my ($disp,$base,$target) = ($1,$2,$3);
+	$word  = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
+	# offset: low four bits go to bits 17..20, bit 4 goes to bit 16
+	$word |= (($disp&0xF)<<17)|(($disp&0x10)<<12);
+	$word |= (1<<5)  if ($completer =~ /^,m/);
+	$word |= (1<<13) if ($completer =~ /^,mb/);
+    }
+
+    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$text)
+			  : "\t".$text;
+};
+
+# Encoder for "std": same calling convention as the other encoders
+# (completer string, operand string). A "%rN,disp(%rM)" operand becomes a
+# raw ".WORD"; anything else is returned as plain text.
+my $std = sub {
+  my ($completer,$operands) = @_;
+  my $text = "std$completer\t$operands";
+
+    my ($source,$disp,$base) =
+	$operands =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/;	# format 6
+    return "\t".$text unless (defined($base));
+
+    my $word = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
+    # offset: low four bits go to bits 1..4, bit 4 goes to bit 0
+    $word |= (($disp&0xF)<<1)|(($disp&0x10)>>4);
+    $word |= (1<<5)  if ($completer =~ /^,m/);
+    $word |= (1<<13) if ($completer =~ /^,mb/);
+    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
+};
+
+# Encoder for "extrd": takes the completer string and the operand string.
+# Fixed-position ("%rN,pos,len,%rM") and %sar-based ("%rN,%sar,len,%rM")
+# operands become a raw ".WORD"; anything else is returned as plain text.
+my $extrd = sub {
+  my ($completer,$operands) = @_;
+  my $text = "extrd$completer\t$operands";
+  my $word;
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($operands =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15
+	my ($source,$pos,$width,$target) = ($1,$2,$3,$4);
+	my $len = 32-$width;
+	$word  = (0x36<<26)|($source<<21)|($target<<16);
+	$word |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
+	$word |= (($len&0x20)<<7)|($len&0x1f);			# encode len
+    }
+    elsif ($operands =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12
+	my ($source,$width,$target) = ($1,$2,$3);
+	my $len = 32-$width;
+	$word  = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
+	$word |= (($len&0x20)<<3)|($len&0x1f);			# encode len
+	$word |= (1<<13) if ($completer =~ /,\**=/);
+    }
+
+    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$text)
+			  : "\t".$text;
+};
+
+# Encoder for "shrpd": takes the completer string and the operand string.
+# A "%rA,%rB,sa,%rT" operand becomes a raw ".WORD"; anything else is
+# returned as plain text.
+my $shrpd = sub {
+  my ($completer,$operands) = @_;
+  my $text = "shrpd$completer\t$operands";
+
+    my ($first,$second,$shift,$target) =
+	$operands =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/;	# format 14
+    return "\t".$text unless (defined($target));
+
+    my $cpos = 63-$shift;
+    my $word = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
+    $word |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
+};
+
+# Encoder for "sub": only the ",db" completer with three general registers
+# becomes a raw ".WORD"; every other form is returned as plain text.
+my $sub = sub {
+  my ($completer,$operands) = @_;
+  my $text = "sub$completer\t$operands";
+
+    my ($first,$second,$target) =
+	$operands =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/;
+    return "\t".$text unless ($completer eq ",db" && defined($target));
+
+    my $word = (0x02<<26)|($second<<21)|($first<<16)|$target;
+    $word |= (1<<10)|(1<<8)|(1<<5);			# e1, e2 and d bits
+    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
+};
+
+# Dispatch one instruction: if a variable named after the mnemonic holds a
+# code reference, call it with (completer, operands); otherwise return the
+# instruction re-joined as "\t<mnemonic><completer>\t<operands>".
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $encoder = eval("\$$mnemonic");	# string eval so that "my" variables are found too
+
+    return &$encoder($mod,$args) if (ref($encoder) eq 'CODE');
+    return "\t$mnemonic$mod\t$args";
+}
+
+# Post-process the generated text line by line and print it.
+foreach (split("\n",$code)) {
+	# evaluate `...` arithmetic embedded in the assembly text
+	s/\`([^\`]*)\`/eval $1/ge;
+	# flip word order in 64-bit mode...
+	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+	# assemble 2.0 instructions in 32-bit mode...
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+	# 64-bit ABI: rewrite "bv" to "bve"
+	s/\bbv\b/bve/gm	if ($SIZE_T==8);
+
+	print $_,"\n";
+}
+# Buffered write errors are only reported at close time; fail the build
+# instead of leaving a silently truncated output behind.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/ppc-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,335 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter,
+# it won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit	+65%	
+# 1024-bit	+35%
+# 2048-bit	+18%
+# 4096-bit	+4%
+
+$flavour = shift;
+
+# The word size drives everything else; a flavour mentioning "32" is
+# tested before one mentioning "64", anything else is rejected.
+if    ($flavour =~ /32/)	{ $BITS=32; $RZONE=224; }
+elsif ($flavour =~ /64/)	{ $BITS=64; $RZONE=288; }
+else				{ die "nonsense $flavour"; }
+
+$BNSZ=	$BITS/8;
+$SIZE_T=$BITS/8;
+
+# Same roles in both builds, only the mnemonics differ.
+($LD,		# load
+ $LDU,		# load and update
+ $LDX,		# load indexed
+ $ST,		# store
+ $STU,		# store and update
+ $STX,		# store indexed
+ $STUX,		# store indexed and update
+ $UMULL,	# unsigned multiply low
+ $UMULH,	# unsigned multiply high
+ $UCMP,		# unsigned compare
+ $SHRI)		# unsigned shift right by immediate
+	= ($BITS==32)
+	? ("lwz","lwzu","lwzx","stw","stwu","stwx","stwux",
+	   "mullw","mulhwu","cmplw","srwi")
+	: ("ld","ldu","ldx","std","stdu","stdx","stdux",
+	   "mulld","mulhdu","cmpld","srdi");
+$PUSH=	$ST;
+$POP=	$LD;
+
+# Stack frame size excluding the tp[] vector, and the offset from the
+# new stack pointer at which tp[] starts.
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
+# Locate ppc-xlate.pl next to this script or in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+# Pipe everything printed from here on through the translator; the next
+# command-line argument is appended to the translator's command line.
+# The parentheses matter: without them "||" binds to the command string
+# (always true) and a failed open would go unnoticed.
+open(STDOUT,"| $^X $xlate $flavour ".shift) || die "can't call $xlate: $!";
+
+# Register map.  r3 is shared: the template below first copies it into
+# $rp (r9) and from then on uses r3 as $ovf.
+($sp,$toc,$ovf,$ap,$bp,$np,$n0,$num,$rp,$aj,$nj,$tj)=map("r$_",(1..12));
+# non-volatile registers
+($i,$j,$tp,$m0,$m1,$lo0,$hi0,$lo1,$hi1,$alo,$ahi,$nlo)=map("r$_",(20..31));
+#
+$nhi="r0";
+
+# Assembler template for .bn_mul_mont_int.  Perl interpolates the
+# register and mnemonic variables assigned above while the here-document
+# is read; the back-quoted arithmetic is evaluated later, when $code is
+# post-processed.  The entry sequence returns with r3=0 when $num is
+# below 4.
+$code=<<___;
+.machine "any"
+.text
+
+.globl	.bn_mul_mont_int
+.align	4
+.bn_mul_mont_int:
+	cmpwi	$num,4
+	mr	$rp,r3		; $rp is reassigned
+	li	r3,0
+	bltlr
+___
+# Only when BN_ULONG is 4 bytes: also return (r3 still 0) for $num>=32.
+$code.=<<___ if ($BNSZ==4);
+	cmpwi	$num,32		; longer key performance is not better
+	bgelr
+___
+# Frame setup, first pass (L1st), outer/inner loops, the final
+# subtraction (Lsub), the copy-back that also zeroes tp[] (Lcopy), and
+# the epilogue, which returns with r3=1.
+$code.=<<___;
+	slwi	$num,$num,`log($BNSZ)/log(2)`
+	li	$tj,-4096
+	addi	$ovf,$num,$FRAME
+	subf	$ovf,$ovf,$sp	; $sp-$ovf
+	and	$ovf,$ovf,$tj	; minimize TLB usage
+	subf	$ovf,$sp,$ovf	; $ovf-$sp
+	mr	$tj,$sp
+	srwi	$num,$num,`log($BNSZ)/log(2)`
+	$STUX	$sp,$sp,$ovf
+
+	$PUSH	r20,`-12*$SIZE_T`($tj)
+	$PUSH	r21,`-11*$SIZE_T`($tj)
+	$PUSH	r22,`-10*$SIZE_T`($tj)
+	$PUSH	r23,`-9*$SIZE_T`($tj)
+	$PUSH	r24,`-8*$SIZE_T`($tj)
+	$PUSH	r25,`-7*$SIZE_T`($tj)
+	$PUSH	r26,`-6*$SIZE_T`($tj)
+	$PUSH	r27,`-5*$SIZE_T`($tj)
+	$PUSH	r28,`-4*$SIZE_T`($tj)
+	$PUSH	r29,`-3*$SIZE_T`($tj)
+	$PUSH	r30,`-2*$SIZE_T`($tj)
+	$PUSH	r31,`-1*$SIZE_T`($tj)
+
+	$LD	$n0,0($n0)	; pull n0[0] value
+	addi	$num,$num,-2	; adjust $num for counter register
+
+	$LD	$m0,0($bp)	; m0=bp[0]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$LOCALS
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
+	$UMULH	$hi0,$aj,$m0
+
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+
+	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
+
+	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
+	$UMULH	$ahi,$aj,$m0
+
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	addze	$hi1,$hi1
+
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+L1st:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LDX	$nj,$np,$j	; np[j]
+	addze	$hi0,$ahi
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
+	addc	$lo1,$nlo,$hi1
+	$UMULH	$ahi,$aj,$m0
+	addze	$hi1,$nhi
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	$UMULH	$nhi,$nj,$m1
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addi	$j,$j,$BNSZ	; j++
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	L1st
+;L1st
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	li	$ovf,0
+	addc	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf	; upmost overflow bit
+	$ST	$hi1,$BNSZ($tp)
+
+	li	$i,$BNSZ
+.align	4
+Louter:
+	$LDX	$m0,$bp,$i	; m0=bp[i]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$LOCALS
+	$LD	$tj,$LOCALS($sp); tp[0]
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
+	$UMULH	$hi0,$aj,$m0
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi0,$hi0
+	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
+	$UMULH	$ahi,$aj,$m0
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	addze	$hi1,$hi1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+Linner:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addze	$hi0,$ahi
+	$LDX	$nj,$np,$j	; np[j]
+	addc	$lo1,$nlo,$hi1
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi1,$nhi
+	$UMULH	$ahi,$aj,$m0
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addze	$hi0,$hi0
+	$UMULH	$nhi,$nj,$m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addi	$j,$j,$BNSZ	; j++
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	Linner
+;Linner
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	addze	$hi0,$hi0
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
+	li	$ovf,0
+	adde	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf
+	$ST	$hi1,$BNSZ($tp)
+;
+	slwi	$tj,$num,`log($BNSZ)/log(2)`
+	$UCMP	$i,$tj
+	addi	$i,$i,$BNSZ
+	ble-	Louter
+
+	addi	$num,$num,2	; restore $num
+	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,$LOCALS
+	mtctr	$num
+
+.align	4
+Lsub:	$LDX	$tj,$tp,$j
+	$LDX	$nj,$np,$j
+	subfe	$aj,$nj,$tj	; tp[j]-np[j]
+	$STX	$aj,$rp,$j
+	addi	$j,$j,$BNSZ
+	bdnz-	Lsub
+
+	li	$j,0
+	mtctr	$num
+	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	$LDX	$tj,$ap,$j
+	$STX	$tj,$rp,$j
+	$STX	$j,$tp,$j	; zap at once
+	addi	$j,$j,$BNSZ
+	bdnz-	Lcopy
+
+	$POP	$tj,0($sp)
+	li	r3,1
+	$POP	r20,`-12*$SIZE_T`($tj)
+	$POP	r21,`-11*$SIZE_T`($tj)
+	$POP	r22,`-10*$SIZE_T`($tj)
+	$POP	r23,`-9*$SIZE_T`($tj)
+	$POP	r24,`-8*$SIZE_T`($tj)
+	$POP	r25,`-7*$SIZE_T`($tj)
+	$POP	r26,`-6*$SIZE_T`($tj)
+	$POP	r27,`-5*$SIZE_T`($tj)
+	$POP	r28,`-4*$SIZE_T`($tj)
+	$POP	r29,`-3*$SIZE_T`($tj)
+	$POP	r30,`-2*$SIZE_T`($tj)
+	$POP	r31,`-1*$SIZE_T`($tj)
+	mr	$sp,$tj
+	blr
+	.long	0
+	.byte	0,12,4,0,0x80,12,6,0
+	.long	0
+.size	.bn_mul_mont_int,.-.bn_mul_mont_int
+
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Evaluate the back-quoted expressions in the template, then hand the
+# text to the translator that STDOUT is piped into.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+# close() on a pipe waits for the child and reports a non-zero exit as
+# failure, so a translator error no longer goes unnoticed.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/ppc.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/ppc.pl
--- a/openssl-1.0.2f/crypto/bn/asm/ppc64-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/ppc64-mont.pl
--- a/openssl-1.0.2f/crypto/bn/asm/rsaz-avx2.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/rsaz-avx2.pl
--- a/openssl-1.0.2f/crypto/bn/asm/rsaz-x86_64.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/rsaz-x86_64.pl
--- a/openssl-1.0.2f/crypto/bn/asm/s390x-gf2m.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, therefore
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
+# This is for 64-bit build. In 32-bit "highgprs" case improvement is
+# even higher, for example on z990 it was measured 80%-150%. ECDSA
+# sign is modest 9%-12% faster. Keep in mind that these coefficients
+# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
+# burnt in it...
+#
+# (*)	gcc 4.1 was observed to deliver better results than gcc 4.3,
+#	so that improvement coefficients can vary from one specific
+#	setup to another.
+
+$flavour = shift;
+
+# Flavours containing "31" or "32" get 4-byte pointers and no "g"
+# suffix on the st/stm/lm mnemonics; everything else is 64-bit.
+($SIZE_T,$g) = ($flavour =~ /3[12]/) ? (4,"") : (8,"g");
+
+# Consume arguments until one looks like a file name (or none are
+# left), then point STDOUT at it.
+until (!($output=shift) || ($output=~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+# %r2..%r6 under the names rp,a1,a0,b1,b0; of these only $rp is used
+# by the templates visible below, and $a1 is given a new meaning
+# further down.
+($rp,$a1,$a0,$b1,$b0)=map("%r$_",(2..6));
+
+($ra,$sp)=("%r14","%r15");
+
+@T=map("%r$_",(0,1));
+@i=map("%r$_",(12,13));
+
+# Working set of _mul_1x1: input/result registers first, then the
+# multiples of a.  From here on $a1 means %r6, not %r3.
+($lo,$hi,$b)=("%r3","%r4","%r5");
+($a1,$a2,$a4,$a8,$a12,$a48)=("%r6","%r7","%r8","%r9","%r10","%r11");
+$a=$lo;			# input a arrives in the register that receives lo
+$mask=$a8;		# %r9 is reused as the table-index mask
+
+# _mul_1x1 (local helper, returns via $ra): takes its operands in $a
+# and $b and leaves a result pair in $lo/$hi.  It fills a 16-entry table
+# tab[0..15] at $stdframe($sp) -- space the caller below allocates --
+# and then XORs shifted table entries selected by successive 4-bit
+# groups of $b into $lo/$hi.
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@function
+.align	16
+_mul_1x1:
+	lgr	$a1,$a
+	sllg	$a2,$a,1
+	sllg	$a4,$a,2
+	sllg	$a8,$a,3
+
+	srag	$lo,$a1,63			# broadcast 63rd bit
+	nihh	$a1,0x1fff
+	srag	@i[0],$a2,63			# broadcast 62nd bit
+	nihh	$a2,0x3fff
+	srag	@i[1],$a4,63			# broadcast 61st bit
+	nihh	$a4,0x7fff
+	ngr	$lo,$b
+	ngr	@i[0],$b
+	ngr	@i[1],$b
+
+	lghi	@T[0],0
+	lgr	$a12,$a1
+	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
+	xgr	$a12,$a2
+	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
+	 lgr	$a48,$a4
+	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
+	 xgr	$a48,$a8
+	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
+	 xgr	$a1,$a4
+
+	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
+	xgr	$a2,$a4
+	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
+	xgr	$a12,$a4
+	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
+	 xgr	$a1,$a48
+	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
+	 xgr	$a2,$a48
+
+	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
+	xgr	$a12,$a48
+	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
+	 xgr	$a1,$a4
+	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
+	 xgr	$a2,$a4
+	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8
+
+	xgr	$a12,$a4
+	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
+	 srlg	$hi,$lo,1
+	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
+	 sllg	$lo,$lo,63
+	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
+	 srlg	@T[0],@i[0],2
+	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8
+
+	lghi	$mask,`0xf<<3`
+	sllg	$a1,@i[0],62
+	 sllg	@i[0],$b,3
+	srlg	@T[1],@i[1],3
+	 ngr	@i[0],$mask
+	sllg	$a2,@i[1],61
+	 srlg	@i[1],$b,4-3
+	xgr	$hi,@T[0]
+	 ngr	@i[1],$mask
+	xgr	$lo,$a1
+	xgr	$hi,@T[1]
+	xgr	$lo,$a2
+
+	xg	$lo,$stdframe(@i[0],$sp)
+	srlg	@i[0],$b,8-3
+	ngr	@i[0],$mask
+___
+# Steps 1..13, unrolled at generation time.  @i and @T are rotated after
+# every step so that consecutive steps alternate between the two index
+# and the two temporary registers.  $n is 14 when the loop ends and is
+# used as such by the template that follows.
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	srlg	@i[1],$b,`($n+2)*4`-3
+	sllg	@T[0],@T[1],`$n*4`
+	ngr	@i[1],$mask
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+___
+	push(@i,shift(@i)); push(@T,shift(@T));
+}
+# Last two steps (shifts of $n*4 and ($n+1)*4 bits) and return from
+# _mul_1x1, followed by the prologue of the exported bn_GF2m_mul_2x2:
+# save %r3..%r15 and allocate $stdframe+128 bytes, the 128 being the
+# table used by _mul_1x1.
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	sllg	@T[0],@T[1],`$n*4`
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+
+	lg	@T[0],$stdframe(@i[0],$sp)
+	sllg	@T[1],@T[0],`($n+1)*4`
+	srlg	@T[0],@T[0],`64-($n+1)*4`
+	xgr	$lo,@T[1]
+	xgr	$hi,@T[0]
+
+	br	$ra
+.size	_mul_1x1,.-_mul_1x1
+
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@function
+.align	16
+bn_GF2m_mul_2x2:
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
+
+	lghi	%r1,-$stdframe-128
+	la	%r0,0($sp)
+	la	$sp,0(%r1,$sp)			# alloca
+	st${g}	%r0,0($sp)			# back chain
+___
+# 8-byte SIZE_T: three _mul_1x1 calls (a1*b1, a0*b0 and
+# (a0+a1)*(b0+b1), operands reloaded from the register save area) whose
+# results are XOR-combined into the four words at $rp.
+# 4-byte SIZE_T: the two halves of each operand are first merged into
+# one 64-bit register, so a single _mul_1x1 call is made.
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+	bras	$ra,_mul_1x1			# a1·b1
+	stmg	$lo,$hi,16($rp)
+
+	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# a0·b0
+	stmg	$lo,$hi,0($rp)
+
+	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
+	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# (a0+a1)·(b0+b1)
+	lmg	@r[0],@r[3],0($rp)
+
+	xgr	$lo,$hi
+	xgr	$hi,@r[1]
+	xgr	$lo,@r[0]
+	xgr	$hi,@r[2]
+	xgr	$lo,@r[3]	
+	xgr	$hi,@r[3]
+	xgr	$lo,$hi
+	stg	$hi,16($rp)
+	stg	$lo,8($rp)
+___
+} else {
+$code.=<<___;
+	sllg	%r3,%r3,32
+	sllg	%r5,%r5,32
+	or	%r3,%r4
+	or	%r5,%r6
+	bras	$ra,_mul_1x1
+	rllg	$lo,$lo,32
+	rllg	$hi,$hi,32
+	stmg	$lo,$hi,0($rp)
+___
+}
+# Common epilogue: reload %r6..%r15 from the save area and return.
+$code.=<<___;
+	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+	br	$ra
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Evaluate the back-quoted expressions in the template and write it out.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+# Output is buffered, so write errors (e.g. disk full) only surface
+# here; do not exit successfully with a truncated file.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/s390x-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,277 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2007.
+#
+# Performance improvement over vanilla C code varies from 85% to 45%
+# depending on key length and benchmark. Unfortunately in this context
+# these are not very impressive results [for code that utilizes "wide"
+# 64x64=128-bit multiplication, which is not commonly available to C
+# programmers], at least hand-coded bn_asm.c replacement is known to
+# provide 30-40% better results for longest keys. Well, on a second
+# thought it's not very surprising, because z-CPUs are single-issue
+# and _strictly_ in-order execution, while bn_mul_mont is more or less
+# dependent on CPU ability to pipe-line instructions and have several
+# of them "in-flight" at the same time. I mean while other methods,
+# for example Karatsuba, aim to minimize amount of multiplications at
+# the cost of other operations increase, bn_mul_mont aim to neatly
+# "overlap" multiplications and the other operations [and on most
+# platforms even minimize the amount of the other operations, in
+# particular references to memory]. But it's possible to improve this
+# module performance by implementing dedicated squaring code-path and
+# possibly by unrolling loops...
+
+# January 2009.
+#
+# Reschedule to minimize/avoid Address Generation Interlock hazard,
+# make inner loops counter-based.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
+# is achieved by swapping words after 64-bit loads, follow _dswap-s.
+# On z990 it was measured to perform 2.6-2.2 times better than
+# compiler-generated code, less for longer keys...
+
+$flavour = shift;
+
+# Flavours containing "31" or "32" get 4-byte pointers and no "g"
+# suffix on the st/stm/l/lm/cl mnemonics; everything else is 64-bit.
+($SIZE_T,$g) = ($flavour =~ /3[12]/) ? (4,"") : (8,"g");
+
+# Consume arguments until one looks like a file name (or none are
+# left), then point STDOUT at it.
+until (!($output=shift) || ($output=~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+($mn0,$num)=map("%r$_",(0,1));
+
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+#		   const BN_ULONG *np, const BN_ULONG *n0, int num);
+# The five pointers are mapped to %r2..%r6; num is not in a register on
+# entry, the template below pulls it from the stack into $num.
+($rp,$ap,$bp,$np,$n0)=map("%r$_",(2..6));
+
+$bi=$rp;	# same register, i.e. zaps rp
+$j="%r7";
+
+($ahi,$alo,$nhi,$nlo,$AHI,$NHI,$count,$sp)=map("%r$_",(8..15));
+
+# Assembler template for bn_mul_mont.  "_dswap" is not an instruction:
+# it is a placeholder rewritten when $code is post-processed.
+# Entry: pull num from the stack, scale it by SIZE_T, save rp, and
+# return 0 in %r2 when the scaled value is below 16.
+$code.=<<___;
+.text
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function
+bn_mul_mont:
+	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
+	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
+	la	$bp,0($num,$bp)
+
+	st${g}	%r2,2*$SIZE_T($sp)
+
+	cghi	$num,16		#
+	lghi	%r2,0		#
+	blr	%r14		# if($num<16) return 0;
+___
+# 31/32-bit flavours only: return 0 when bit 2 of the scaled $num is set.
+$code.=<<___ if ($flavour =~ /3[12]/);
+	tmll	$num,4
+	bnzr	%r14		# if ($num&1) return 0;
+___
+# All other flavours: return 0 when the scaled $num exceeds 96.
+$code.=<<___ if ($flavour !~ /3[12]/);
+	cghi	$num,96		#
+	bhr	%r14		# if($num>96) return 0;
+___
+# Main body: register save and alloca of tp[], first pass (.L1st),
+# outer/inner loops, final subtraction (.Lsub), copy-back that also
+# zeroes tp[] (.Lcopy), register restore and return of 1 in %r2.
+$code.=<<___;
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
+
+	lghi	$rp,-$stdframe-8	# leave room for carry bit
+	lcgr	$j,$num		# -$num
+	lgr	%r0,$sp
+	la	$rp,0($rp,$sp)
+	la	$sp,0($j,$rp)	# alloca
+	st${g}	%r0,0($sp)	# back chain
+
+	sra	$num,3		# restore $num
+	la	$bp,0($j,$bp)	# restore $bp
+	ahi	$num,-1		# adjust $num for inner loop
+	lg	$n0,0($n0)	# pull n0
+	_dswap	$n0
+
+	lg	$bi,0($bp)
+	_dswap	$bi
+	lg	$alo,0($ap)
+	_dswap	$alo
+	mlgr	$ahi,$bi	# ap[0]*bp[0]
+	lgr	$AHI,$ahi
+
+	lgr	$mn0,$alo	# "tp[0]"*n0
+	msgr	$mn0,$n0
+
+	lg	$nlo,0($np)	#
+	_dswap	$nlo
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.L1st:
+	lg	$alo,0($j,$ap)
+	_dswap	$alo
+	mlgr	$ahi,$bi	# ap[j]*bp[0]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI	# +="tp[j]"
+	algr	$nlo,$alo
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.L1st
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI	# upmost overflow bit
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
+	la	$bp,8($bp)	# bp++
+
+.Louter:
+	lg	$bi,0($bp)	# bp[i]
+	_dswap	$bi
+	lg	$alo,0($ap)
+	_dswap	$alo
+	mlgr	$ahi,$bi	# ap[0]*bp[i]
+	alg	$alo,$stdframe($sp)	# +=tp[0]
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lgr	$mn0,$alo
+	msgr	$mn0,$n0	# tp[0]*n0
+
+	lg	$nlo,0($np)	# np[0]
+	_dswap	$nlo
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.Linner:
+	lg	$alo,0($j,$ap)
+	_dswap	$alo
+	mlgr	$ahi,$bi	# ap[j]*bp[i]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$ahi,$AHI
+	alg	$alo,$stdframe($j,$sp)# +=tp[j]
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI
+	algr	$nlo,$alo	# +="tp[j]"
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.Linner
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI
+	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
+	lghi	$ahi,0
+	alcgr	$AHI,$ahi	# new upmost overflow bit
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
+
+	la	$bp,8($bp)	# bp++
+	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
+	jne	.Louter
+
+	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
+	la	$ap,$stdframe($sp)
+	ahi	$num,1		# restore $num, incidentally clears "borrow"
+
+	la	$j,0(%r0)
+	lr	$count,$num
+.Lsub:	lg	$alo,0($j,$ap)
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	slbgr	$alo,$nlo
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lsub
+	lghi	$ahi,0
+	slbgr	$AHI,$ahi	# handle upmost carry
+
+	ngr	$ap,$AHI
+	lghi	$np,-1
+	xgr	$np,$AHI
+	ngr	$np,$rp
+	ogr	$ap,$np		# ap=borrow?tp:rp
+
+	la	$j,0(%r0)
+	lgr	$count,$num
+.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
+	_dswap	$alo
+	stg	$j,$stdframe($j,$sp)	# zap tp
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lcopy
+
+	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+	lm${g}	%r6,%r15,0(%r1)
+	lghi	%r2,1		# signal "processed"
+	br	%r14
+.size	bn_mul_mont,.-bn_mul_mont
+.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Post-process the template line by line and print it.
+foreach (split("\n",$code)) {
+	# evaluate `...` expressions embedded in the template
+	s/\`([^\`]*)\`/eval $1/ge;
+	# "_dswap reg" becomes a rotate by 32 bits when SIZE_T is 4 and is
+	# removed (replaced by nothing) otherwise
+	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+	print $_,"\n";
+}
+# Output is buffered, so write errors (e.g. disk full) only surface
+# here; do not exit successfully with a truncated file.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/s390x.S
+++ b/openssl-1.0.2f/crypto/bn/asm/s390x.S
@@ -0,0 +1,713 @@
+.ident "s390x.S, version 1.1"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.text
+
+// %r0 under the name "zero": the routines that use it load it with 0
+// (lghi zero,0) and use it as the addend when only the carry is to be
+// added (alcgr x,zero).
+#define zero	%r0
+
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] += ap[i]*w for len words; the final carry word is returned in
+// %r2 and 0 is returned at once when len<=0.  %r1 is turned into rp-ap
+// so that %r3 alone walks both arrays.  Groups of four words go through
+// .Loop4_madd/.Loop4_madd_tail, the remaining len%4 words through
+// .Loop1_madd; %r12 holds the carry word handed from one step to the
+// next.  The carry flag is kept alive across the loop-control
+// instructions, which is why those are picked not to change the
+// condition code.
+.globl	bn_mul_add_words
+.type	bn_mul_add_words,@function
+.align	4
+bn_mul_add_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside [to give way to]
+	lghi	%r2,0		// return value
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r13,48(%r15)
+	lghi	%r2,3
+	lghi	%r12,0		// carry = 0
+	slgr	%r1,%r3		// rp-=ap
+	nr	%r2,%r4		// len%4
+	sra	%r4,2		// cnt=len/4
+	jz	.Loop1_madd	// carry is incidentally cleared if branch taken
+	algr	zero,zero	// clear carry
+
+	lg	%r7,0(%r3)	// ap[0]
+	lg	%r9,8(%r3)	// ap[1]
+	mlgr	%r6,%r5		// *=w
+	brct	%r4,.Loop4_madd
+	j	.Loop4_madd_tail
+
+.Loop4_madd:
+	mlgr	%r8,%r5
+	lg	%r11,16(%r3)	// ap[i+2]
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
+	alcgr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
+
+	mlgr	%r12,%r5
+	lg	%r7,32(%r3)
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
+
+	mlgr	%r6,%r5
+	lg	%r9,40(%r3)
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
+
+	la	%r3,32(%r3)	// i+=4
+	brct	%r4,.Loop4_madd
+
+.Loop4_madd_tail:
+	mlgr	%r8,%r5
+	lg	%r11,16(%r3)
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	mlgr	%r10,%r5
+	lg	%r13,24(%r3)
+	alcgr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,8(%r3,%r1)
+	stg	%r9,8(%r3,%r1)
+
+	mlgr	%r12,%r5
+	alcgr	%r11,%r8
+	alcgr	%r10,zero
+	alg	%r11,16(%r3,%r1)
+	stg	%r11,16(%r3,%r1)
+
+	alcgr	%r13,%r10
+	alcgr	%r12,zero
+	alg	%r13,24(%r3,%r1)
+	stg	%r13,24(%r3,%r1)
+
+	la	%r3,32(%r3)	// i+=4
+
+	la	%r2,1(%r2)	// see if len%4 is zero ...
+	brct	%r2,.Loop1_madd	// without touching condition code:-)
+
+.Lend_madd:
+	lgr	%r2,zero	// return value
+	alcgr	%r2,%r12	// collect even carry bit
+	lmg	%r6,%r13,48(%r15)
+	br	%r14
+
+.Loop1_madd:
+	lg	%r7,0(%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	alcgr	%r7,%r12	// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r3,%r1)	// +=rp[i]
+	stg	%r7,0(%r3,%r1)	// rp[i]=
+
+	lgr	%r12,%r6
+	la	%r3,8(%r3)	// i++
+	brct	%r2,.Loop1_madd
+
+	j	.Lend_madd
+.size	bn_mul_add_words,.-bn_mul_add_words
+
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+//
+// rp[i] = ap[i]*w + carry for len words; the final carry word is
+// returned in %r2 and 0 is returned at once when len<=0.  rp is moved
+// to %r1 and %r2 serves as the byte index i while looping.  Groups of
+// four words go through .Loop4_mul, the remaining len%4 (kept in %r10)
+// through .Loop1_mul; the carry word is in %r8 whenever a loop is
+// entered or left.
+.globl	bn_mul_words
+.type	bn_mul_words,@function
+.align	4
+bn_mul_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0;
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r10,48(%r15)
+	lghi	%r10,3
+	lghi	%r8,0		// carry = 0
+	nr	%r10,%r4	// len%4
+	sra	%r4,2		// cnt=len/4
+	jz	.Loop1_mul	// carry is incidentally cleared if branch taken
+	algr	zero,zero	// clear carry
+
+.Loop4_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	alcgr	%r7,%r8		// +=carry
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lg	%r9,8(%r2,%r3)
+	mlgr	%r8,%r5
+	alcgr	%r9,%r6
+	stg	%r9,8(%r2,%r1)
+
+	lg	%r7,16(%r2,%r3)
+	mlgr	%r6,%r5
+	alcgr	%r7,%r8
+	stg	%r7,16(%r2,%r1)
+
+	lg	%r9,24(%r2,%r3)
+	mlgr	%r8,%r5
+	alcgr	%r9,%r6
+	stg	%r9,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r4,.Loop4_mul
+
+	la	%r10,1(%r10)		// see if len%4 is zero ...
+	brct	%r10,.Loop1_mul		// without touching condition code:-)
+
+.Lend_mul:
+	alcgr	%r8,zero	// collect carry bit
+	lgr	%r2,%r8
+	lmg	%r6,%r10,48(%r15)
+	br	%r14
+
+.Loop1_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	alcgr	%r7,%r8		// +=carry
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lgr	%r8,%r6
+	la	%r2,8(%r2)	// i++
+	brct	%r10,.Loop1_mul
+
+	j	.Lend_mul
+.size	bn_mul_words,.-bn_mul_words
+
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
+//
+// For each of the len input words at %r3, store the two words of its
+// square at %r2: the word left in %r7 by mlgr first, the one in %r6
+// after it.  The output pointer therefore advances twice as fast as the
+// input pointer.  Returns immediately when len<=0.  Four words per
+// iteration in .Loop4_sqr, the remaining len%4 in .Loop1_sqr.
+.globl	bn_sqr_words
+.type	bn_sqr_words,@function
+.align	4
+bn_sqr_words:
+	ltgfr	%r4,%r4
+	bler	%r14
+
+	stmg	%r6,%r7,48(%r15)
+	srag	%r1,%r4,2	// cnt=len/4
+	jz	.Loop1_sqr
+
+.Loop4_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	lg	%r7,8(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,16(%r2)
+	stg	%r6,24(%r2)
+
+	lg	%r7,16(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,32(%r2)
+	stg	%r6,40(%r2)
+
+	lg	%r7,24(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,48(%r2)
+	stg	%r6,56(%r2)
+
+	la	%r3,32(%r3)
+	la	%r2,64(%r2)
+	brct	%r1,.Loop4_sqr
+
+	lghi	%r1,3
+	nr	%r4,%r1		// cnt=len%4
+	jz	.Lend_sqr
+
+.Loop1_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	la	%r3,8(%r3)
+	la	%r2,16(%r2)
+	brct	%r4,.Loop1_sqr
+
+.Lend_sqr:
+	lmg	%r6,%r7,48(%r15)
+	br	%r14
+.size	bn_sqr_words,.-bn_sqr_words
+
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+//
+// h and l arrive in %r2 and %r3, which is the even/odd register pair
+// dlgr divides by d (%r4).  The value dlgr leaves in %r3 is returned;
+// whatever it leaves in %r2 is overwritten.
+.globl	bn_div_words
+.type	bn_div_words,@function
+.align	4
+bn_div_words:
+	dlgr	%r2,%r4
+	lgr	%r2,%r3
+	br	%r14
+.size	bn_div_words,.-bn_div_words
+
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] + bp[i] + carry for len words; the final carry is
+// returned in %r2 and 0 is returned at once when len<=0.  rp is moved
+// to %r1 and %r2 serves as the byte index i while looping.  The carry
+// lives in the condition code across iterations, so the loop control
+// uses only instructions that leave the condition code alone (la,
+// brct).  Four words per iteration in .Loop4_add, the remaining len%4
+// (kept in %r6) in .Loop1_add.
+.globl	bn_add_words
+.type	bn_add_words,@function
+.align	4
+bn_add_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jz	.Loop1_add	// carry is incidentally cleared if branch taken
+	algr	%r2,%r2		// clear carry
+
+.Loop4_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	alcg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	alcg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	alcg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_add
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_add	// without touching condition code:-)
+
+.Lexit_add:
+	lghi	%r2,0
+	alcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+
+.Loop1_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_add
+
+	j	.Lexit_add
+.size	bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+//
+// rp[i] = ap[i] - bp[i] - borrow for len words; the final borrow is
+// returned in %r2 and 0 is returned at once when len<=0.  Same layout
+// as bn_add_words: rp in %r1, byte index in %r2, borrow kept in the
+// condition code across iterations, len%4 in %r6.  At .Lexit_sub the
+// borrow is first materialised as 0 or -1 (slbgr) and then negated to
+// give 0 or 1.
+.globl	bn_sub_words
+.type	bn_sub_words,@function
+.align	4
+bn_sub_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jnz	.Loop4_sub	// borrow is incidentally cleared if branch taken
+	slgr	%r2,%r2		// clear borrow
+
+.Loop1_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_sub
+	j	.Lexit_sub
+
+.Loop4_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	slbg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	slbg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	slbg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_sub
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_sub	// without touching condition code:-)
+
+.Lexit_sub:
+	lghi	%r2,0
+	slbgr	%r2,%r2
+	lcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+.size	bn_sub_words,.-bn_sub_words
+
+// Accumulator registers shared by the comba routines below.  The
+// routines pass them to the macros in rotating order, so the macro
+// parameters named c1,c2,c3 are not always these three in this order.
+#define c1	%r1
+#define c2	%r5
+#define c3	%r8
+
+// mul_add_c(ai,bi,c1,c2,c3): multiply word ai of the array at %r3 by
+// word bi of the array at %r4 and add the double-word product to the
+// accumulator: the word left in %r7 goes into c1, the one in %r6 plus
+// carry into c2, and the remaining carry into c3.  Clobbers %r6/%r7 and
+// expects "zero" to hold 0.
+#define mul_add_c(ai,bi,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,bi*8(%r4);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// 8x8-word multiplication, 16 result words stored at %r2.  The products
+// are accumulated column by column: every mul_add_c(i,j,...) with
+// i+j==k contributes to result word k, which is stored once the column
+// is complete.  The three accumulator registers rotate roles from one
+// column to the next and the register just stored is reset to 0.
+// %r6-%r8 are saved on entry and restored on exit.
+.globl	bn_mul_comba8
+.type	bn_mul_comba8,@function
+.align	4
+bn_mul_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(4,0,c2,c3,c1);
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	mul_add_c(0,4,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(0,5,c3,c1,c2);
+	mul_add_c(1,4,c3,c1,c2);
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	mul_add_c(4,1,c3,c1,c2);
+	mul_add_c(5,0,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(6,0,c1,c2,c3);
+	mul_add_c(5,1,c1,c2,c3);
+	mul_add_c(4,2,c1,c2,c3);
+	mul_add_c(3,3,c1,c2,c3);
+	mul_add_c(2,4,c1,c2,c3);
+	mul_add_c(1,5,c1,c2,c3);
+	mul_add_c(0,6,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,7,c2,c3,c1);
+	mul_add_c(1,6,c2,c3,c1);
+	mul_add_c(2,5,c2,c3,c1);
+	mul_add_c(3,4,c2,c3,c1);
+	mul_add_c(4,3,c2,c3,c1);
+	mul_add_c(5,2,c2,c3,c1);
+	mul_add_c(6,1,c2,c3,c1);
+	mul_add_c(7,0,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,1,c3,c1,c2);
+	mul_add_c(6,2,c3,c1,c2);
+	mul_add_c(5,3,c3,c1,c2);
+	mul_add_c(4,4,c3,c1,c2);
+	mul_add_c(3,5,c3,c1,c2);
+	mul_add_c(2,6,c3,c1,c2);
+	mul_add_c(1,7,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(2,7,c1,c2,c3);
+	mul_add_c(3,6,c1,c2,c3);
+	mul_add_c(4,5,c1,c2,c3);
+	mul_add_c(5,4,c1,c2,c3);
+	mul_add_c(6,3,c1,c2,c3);
+	mul_add_c(7,2,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(7,3,c2,c3,c1);
+	mul_add_c(6,4,c2,c3,c1);
+	mul_add_c(5,5,c2,c3,c1);
+	mul_add_c(4,6,c2,c3,c1);
+	mul_add_c(3,7,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(4,7,c3,c1,c2);
+	mul_add_c(5,6,c3,c1,c2);
+	mul_add_c(6,5,c3,c1,c2);
+	mul_add_c(7,4,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(7,5,c1,c2,c3);
+	mul_add_c(6,6,c1,c2,c3);
+	mul_add_c(5,7,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+
+	mul_add_c(6,7,c2,c3,c1);
+	mul_add_c(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+//
+// 4x4-word multiplication, 8 result words stored at %r2.  The products
+// are accumulated column by column: every mul_add_c(i,j,...) with
+// i+j==k contributes to result word k, which is stored once the column
+// is complete.  The three accumulator registers rotate roles from one
+// column to the next and the register just stored is reset to 0.
+// %r6-%r8 are saved on entry and restored on exit.
+.globl	bn_mul_comba4
+.type	bn_mul_comba4,@function
+.align	4
+bn_mul_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r2)	// result word 0 goes to r, not into input a
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(3,3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)	// restore (load, not store) %r6-%r8
+	br	%r14
+.size	bn_mul_comba4,.-bn_mul_comba4
+
+// sqr_add_c(ai,c1,c2,c3): square word ai of the array at %r3 and add
+// the double-word result to the accumulator (c1, then c2 with carry,
+// then the carry into c3).  Clobbers %r6/%r7, expects "zero" to hold 0.
+#define sqr_add_c(ai,c1,c2,c3)		\
+	lg	%r7,ai*8(%r3);		\
+	mlgr	%r6,%r7;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// sqr_add_c2(ai,aj,c1,c2,c3): multiply words ai and aj of the array at
+// %r3 and add the double-word product to the accumulator twice.
+// Clobbers %r6/%r7, expects "zero" to hold 0.
+#define sqr_add_c2(ai,aj,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,aj*8(%r3);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+//
+// Squares the 8-word number at %r3, 16 result words stored at %r2.
+// Column k collects sqr_add_c(i) for 2*i==k and sqr_add_c2(i,j) for
+// i+j==k with i>j; the latter adds its product twice, standing in for
+// both a[i]*a[j] and a[j]*a[i].  The three accumulator registers rotate
+// roles from one column to the next.  %r6-%r8 are saved on entry and
+// restored on exit.
+.globl	bn_sqr_comba8
+.type	bn_sqr_comba8,@function
+.align	4
+bn_sqr_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	sqr_add_c2(4,0,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(5,0,c3,c1,c2);
+	sqr_add_c2(4,1,c3,c1,c2);
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	sqr_add_c2(4,2,c1,c2,c3);
+	sqr_add_c2(5,1,c1,c2,c3);
+	sqr_add_c2(6,0,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,0,c2,c3,c1);
+	sqr_add_c2(6,1,c2,c3,c1);
+	sqr_add_c2(5,2,c2,c3,c1);
+	sqr_add_c2(4,3,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(4,c3,c1,c2);
+	sqr_add_c2(5,3,c3,c1,c2);
+	sqr_add_c2(6,2,c3,c1,c2);
+	sqr_add_c2(7,1,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(7,2,c1,c2,c3);
+	sqr_add_c2(6,3,c1,c2,c3);
+	sqr_add_c2(5,4,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(5,c2,c3,c1);
+	sqr_add_c2(6,4,c2,c3,c1);
+	sqr_add_c2(7,3,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(7,4,c3,c1,c2);
+	sqr_add_c2(6,5,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(6,c1,c2,c3);
+	sqr_add_c2(7,5,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+//
+// Squares the 4-word number at %r3, 8 result words stored at %r2.
+// Column k collects sqr_add_c(i) for 2*i==k and sqr_add_c2(i,j) for
+// i+j==k with i>j; the latter adds its product twice, standing in for
+// both a[i]*a[j] and a[j]*a[i].  The three accumulator registers rotate
+// roles from one column to the next.  %r6-%r8 are saved on entry and
+// restored on exit.
+.globl bn_sqr_comba4
+.type	bn_sqr_comba4,@function
+.align	4
+bn_sqr_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba4,.-bn_sqr_comba4
--- a/openssl-1.0.2f/crypto/bn/asm/sparct4-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/sparct4-mont.pl
--- a/openssl-1.0.2f/crypto/bn/asm/sparcv8.S
+++ b/openssl-1.0.2f/crypto/bn/asm/sparcv8.S
--- a/openssl-1.0.2f/crypto/bn/asm/sparcv8plus.S
+++ b/openssl-1.0.2f/crypto/bn/asm/sparcv8plus.S
--- a/openssl-1.0.2f/crypto/bn/asm/sparcv9-gf2m.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/sparcv9-gf2m.pl
@@ -0,0 +1,190 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: one suitable
+# for all SPARCv9 processors and one for VIS3-capable ones. Former
+# delivers ~25-45% more, more for longer keys, heaviest DH and DSA
+# verify operations on venerable UltraSPARC II. On T4 VIS3 code is
+# ~100-230% faster than gcc-generated code and ~35-90% faster than
+# the pure SPARCv9 code path.
+
+# 16 table slots of 8 bytes each; reserved on the stack by the save in .Lsoftware
+$locals = 8 * 16;
+$tab = "%l0";		# base address of the 16-entry lookup table
+# two index registers and two temporaries; the unrolled loop swaps each pair every round
+@T = qw(%g2 %g3);
+@i = qw(%g4 %g5);
+# the shifted/combined copies of a are kept in %o0..%o5
+($a1, $a2, $a4, $a8, $a12, $a48) = map { "%o$_" } 0 .. 5;
+$lo = "%g1"; $hi = $a8; $b = "%o7"; $a = $lo;	# $hi shares a register with $a8, $a with $lo
+
+$code.=<<___;	# emits the entry point: tests SPARCV9_VIS3 in OPENSSL_sparcv9cap_P[0] and either uses xmulx/xmulxhi directly or falls to .Lsoftware, which stores tab[0..15] (XOR combinations of a1,a2,a4,a8) and starts the 4-bit-at-a-time lookups on b
+#include <sparc_arch.h>
+
+#ifdef __arch64__
+.register	%g2,#scratch
+.register	%g3,#scratch
+#endif
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl	bn_GF2m_mul_2x2
+.align	16
+bn_GF2m_mul_2x2:
+        SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+        ld	[%g1+0],%g1             	! OPENSSL_sparcv9cap_P[0]
+
+        andcc	%g1, SPARCV9_VIS3, %g0
+        bz,pn	%icc,.Lsoftware
+        nop
+
+	sllx	%o1, 32, %o1
+	sllx	%o3, 32, %o3
+	or	%o2, %o1, %o1
+	or	%o4, %o3, %o3
+	.word	0x95b262ab			! xmulx   %o1, %o3, %o2
+	.word	0x99b262cb			! xmulxhi %o1, %o3, %o4
+	srlx	%o2, 32, %o1			! 13 cycles later
+	st	%o2, [%o0+0]
+	st	%o1, [%o0+4]
+	srlx	%o4, 32, %o3
+	st	%o4, [%o0+8]
+	retl
+	st	%o3, [%o0+12]
+
+.align	16
+.Lsoftware:
+	save	%sp,-STACK_FRAME-$locals,%sp
+
+	sllx	%i1,32,$a
+	mov	-1,$a12
+	sllx	%i3,32,$b
+	or	%i2,$a,$a
+	srlx	$a12,1,$a48			! 0x7fff...
+	or	%i4,$b,$b
+	srlx	$a12,2,$a12			! 0x3fff...
+	add	%sp,STACK_BIAS+STACK_FRAME,$tab
+
+	sllx	$a,2,$a4
+	mov	$a,$a1
+	sllx	$a,1,$a2
+
+	srax	$a4,63,@i[1]			! broadcast 61st bit
+	and	$a48,$a4,$a4			! (a<<2)&0x7fff...
+	srlx	$a48,2,$a48
+	srax	$a2,63,@i[0]			! broadcast 62nd bit
+	and	$a12,$a2,$a2			! (a<<1)&0x3fff...
+	srax	$a1,63,$lo			! broadcast 63rd bit
+	and	$a48,$a1,$a1			! (a<<0)&0x1fff...
+
+	sllx	$a1,3,$a8
+	and	$b,$lo,$lo
+	and	$b,@i[0],@i[0]
+	and	$b,@i[1],@i[1]
+
+	stx	%g0,[$tab+0*8]			! tab[0]=0
+	xor	$a1,$a2,$a12
+	stx	$a1,[$tab+1*8]			! tab[1]=a1
+	stx	$a2,[$tab+2*8]			! tab[2]=a2
+	 xor	$a4,$a8,$a48
+	stx	$a12,[$tab+3*8]			! tab[3]=a1^a2
+	 xor	$a4,$a1,$a1
+
+	stx	$a4,[$tab+4*8]			! tab[4]=a4
+	xor	$a4,$a2,$a2
+	stx	$a1,[$tab+5*8]			! tab[5]=a1^a4
+	xor	$a4,$a12,$a12
+	stx	$a2,[$tab+6*8]			! tab[6]=a2^a4
+	 xor	$a48,$a1,$a1
+	stx	$a12,[$tab+7*8]			! tab[7]=a1^a2^a4
+	 xor	$a48,$a2,$a2
+
+	stx	$a8,[$tab+8*8]			! tab[8]=a8
+	xor	$a48,$a12,$a12
+	stx	$a1,[$tab+9*8]			! tab[9]=a1^a8
+	 xor	$a4,$a1,$a1
+	stx	$a2,[$tab+10*8]			! tab[10]=a2^a8
+	 xor	$a4,$a2,$a2
+	stx	$a12,[$tab+11*8]		! tab[11]=a1^a2^a8
+
+	xor	$a4,$a12,$a12
+	stx	$a48,[$tab+12*8]		! tab[12]=a4^a8
+	 srlx	$lo,1,$hi
+	stx	$a1,[$tab+13*8]			! tab[13]=a1^a4^a8
+	 sllx	$lo,63,$lo
+	stx	$a2,[$tab+14*8]			! tab[14]=a2^a4^a8
+	 srlx	@i[0],2,@T[0]
+	stx	$a12,[$tab+15*8]		! tab[15]=a1^a2^a4^a8
+
+	sllx	@i[0],62,$a1
+	 sllx	$b,3,@i[0]
+	srlx	@i[1],3,@T[1]
+	 and	@i[0],`0xf<<3`,@i[0]
+	sllx	@i[1],61,$a2
+	 ldx	[$tab+@i[0]],@i[0]
+	 srlx	$b,4-3,@i[1]
+	xor	@T[0],$hi,$hi
+	 and	@i[1],`0xf<<3`,@i[1]
+	xor	$a1,$lo,$lo
+	 ldx	[$tab+@i[1]],@i[1]
+	xor	@T[1],$hi,$hi
+
+	xor	@i[0],$lo,$lo
+	srlx	$b,8-3,@i[0]
+	 xor	$a2,$lo,$lo
+	and	@i[0],`0xf<<3`,@i[0]
+___
+for($n=1;$n<14;$n++) {	# emits 13 copies of the round below; $n is left at 14 for the tail that follows the loop
+$code.=<<___;	# $n is interpolated here; the backticked arithmetic is folded into constants by the s///e pass at the end of the script
+	sllx	@i[1],`$n*4`,@T[0]
+	ldx	[$tab+@i[0]],@i[0]
+	srlx	@i[1],`64-$n*4`,@T[1]
+	xor	@T[0],$lo,$lo
+	srlx	$b,`($n+2)*4`-3,@i[1]
+	xor	@T[1],$hi,$hi
+	and	@i[1],`0xf<<3`,@i[1]
+___
+	push(@i,shift(@i)); push(@T,shift(@T));	# rotate both two-element arrays so the next round uses the other register of each pair
+}
+$code.=<<___;	# $n is 14 here: folds the last two table entries into $lo/$hi, stores the result as four 32-bit words at [%i0+0..12] and returns
+	sllx	@i[1],`$n*4`,@T[0]
+	ldx	[$tab+@i[0]],@i[0]
+	srlx	@i[1],`64-$n*4`,@T[1]
+	xor	@T[0],$lo,$lo
+
+	sllx	@i[0],`($n+1)*4`,@T[0]
+	 xor	@T[1],$hi,$hi
+	srlx	@i[0],`64-($n+1)*4`,@T[1]
+	xor	@T[0],$lo,$lo
+	xor	@T[1],$hi,$hi
+
+	srlx	$lo,32,%i1
+	st	$lo,[%i0+0]
+	st	%i1,[%i0+4]
+	srlx	$hi,32,%i2
+	st	$hi,[%i0+8]
+	st	%i2,[%i0+12]
+
+	ret
+	restore
+.type	bn_GF2m_mul_2x2,#function
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+$code =~ s{`([^`]*)`}{eval $1}gem;	# replace every `expr` in the template with the value Perl computes for it
+print($code);
+close(STDOUT);	# done writing the generated assembly
--- a/openssl-1.0.2f/crypto/bn/asm/sparcv9-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaken effort are multiple. First of all, UltraSPARC is not
+# the whole SPARCv9 universe and other VIS-free implementations deserve
+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
+# several integrated RSA/DSA accelerator circuits accessible through
+# kernel driver [only(*)], but having decent user-land software
+# implementation is important too. Finally, reasons like desire to
+# experiment with dedicated squaring procedure. Yes, this module
+# implements one, because it was easiest to draft it in SPARCv9
+# instructions...
+
+# (*)	Engine accessing the driver in question is on my TODO list.
+#	For reference, accelerator is estimated to give 6 to 10 times
+#	improvement on single-threaded RSA sign. It should be noted
+#	that 6-10x improvement coefficient does not actually mean
+#	something extraordinary in terms of absolute [single-threaded]
+#	performance, as SPARCv9 instruction set is by all means least
+#	suitable for high performance crypto among other 64 bit
+#	platforms. 6-10x factor simply places T1 in same performance
+#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
+#	appear impressive at all, but it's the sign operation which is
+#	far more critical/interesting.
+
+# You might notice that inner loops are modulo-scheduled:-) This has
+# essentially negligible impact on UltraSPARC performance, it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
+# module still has hidden potential [see TODO list there], which is
+# estimated to be larger than 20%...
+
+# Register assignment for
+#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
+# The six arguments are named by %i0..%i5, in declaration order.
+($rp,$ap,$bp,$np,$n0,$num) = map { "%i$_" } 0..5;
+
+# 32-bit ABI unless -m64 or -xarch=v9 appears among the command-line
+# arguments; the choice fixes the stack bias and the frame size.
+$bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
+($bias,$frame) = ($bits==64) ? (2047,192) : (0,128);
+
+($car0,$car1,$car2) = ("%o0","%o1","%o2");	# $car2 holds 1 bit
+($acc0,$acc1)       = ("%o3","%o4");
+$mask               = "%g1";			# 32 bits, what a waste...
+($tmp0,$tmp1)       = ("%g4","%g5");
+
+# loop counters, current multipliers, and the tp/ap/np/tp element registers
+($i,$j,$mul0,$mul1,$tp,$apj,$npj,$tpj) = map { "%l$_" } 0..7;
+
+$fname = "bn_mul_mont_int";
+
+$code=<<___;	# multiply path: returns 0 when num<4; otherwise .Lenter carves tp[] out of the stack, branches to .Lbn_sqr_mont when ap==bp, else .L1st/.Louter/.Linner accumulate the ap[] and np[] products, and .Ltail subtracts np, selects tp or the difference, copies it to rp, zaps tp and returns 1
+.section	".text",#alloc,#execinstr
+
+.global	$fname
+.align	32
+$fname:
+	cmp	%o5,4			! 128 bits minimum
+	bge,pt	%icc,.Lenter
+	sethi	%hi(0xffffffff),$mask
+	retl
+	clr	%o0
+.align	32
+.Lenter:
+	save	%sp,-$frame,%sp
+	sll	$num,2,$num		! num*=4
+	or	$mask,%lo(0xffffffff),$mask
+	ld	[$n0],$n0
+	cmp	$ap,$bp
+	and	$num,$mask,$num
+	ld	[$bp],$mul0		! bp[0]
+	nop
+
+	add	%sp,$bias,%o7		! real top of stack
+	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
+	sub	%o7,$num,%o7
+	ld	[$ap+4],$apj		! ap[1]
+	and	%o7,-1024,%o7
+	ld	[$np],$car1		! np[0]
+	sub	%o7,$bias,%sp		! alloca
+	ld	[$np+4],$npj		! np[1]
+	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
+	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj		!prologue!
+
+	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.L1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]
+	cmp	$j,$num
+	mov	$tmp1,$acc1
+	srlx	$car1,32,$car1
+	bl	%icc,.L1st
+	add	$tp,4,$tp		! tp++
+!.L1st
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	mov	4,$i			! i++
+	ld	[$bp+4],$mul0		! bp[1]
+.Louter:
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap],$car0		! ap[0]
+	ld	[$ap+4],$apj		! ap[1]
+	ld	[$np],$car1		! np[0]
+	ld	[$np+4],$npj		! np[1]
+	ld	[$tp],$tmp1		! tp[0]
+	ld	[$tp+4],$tpj		! tp[1]
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0
+	mulx	$apj,$mul0,$tmp0	!prologue!
+	add	$tmp1,$car0,$car0
+	ld	[$ap+8],$apj		!prologue!
+	and	$car0,$mask,$acc0
+
+	mulx	$n0,$acc0,$mul1
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1	!prologue!
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.Linner:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	add	$acc0,$car0,$car0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	and	$car0,$mask,$acc0
+	ld	[$tp+8],$tpj		! tp[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+	mov	$tmp1,$acc1
+	cmp	$j,$num
+	bl	%icc,.Linner
+	add	$tp,4,$tp		! tp++
+!.Linner
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	add	$acc0,$car0,$car0
+	ld	[$tp+8],$tpj		! tp[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$tpj,$car0,$car0
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]		! tp[j-1]
+	srlx	$car0,32,$car0
+	add	$i,4,$i			! i++
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	cmp	$i,$num
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+8]
+
+	srlx	$car1,32,$car2
+	bl,a	%icc,.Louter
+	ld	[$bp+$i],$mul0		! bp[i]
+!.Louter
+
+	add	$tp,12,$tp
+
+.Ltail:
+	add	$np,$num,$np
+	add	$rp,$num,$rp
+	mov	$tp,$ap
+	sub	%g0,$num,%o7		! k=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+.align	16
+.Lsub:
+	ld	[$tp+%o7],%o0
+	ld	[$np+%o7],%o1
+	subccc	%o0,%o1,%o1		! tp[j]-np[j]
+	add	$rp,%o7,$i
+	add	%o7,4,%o7
+	brnz	%o7,.Lsub
+	st	%o1,[$i]
+	subc	$car2,0,$car2		! handle upmost overflow bit
+	and	$tp,$car2,$ap
+	andn	$rp,$car2,$np
+	or	$ap,$np,$ap
+	sub	%g0,$num,%o7
+
+.Lcopy:
+	ld	[$ap+%o7],%o0		! copy or in-place refresh
+	st	%g0,[$tp+%o7]		! zap tp
+	st	%o0,[$rp+%o7]
+	add	%o7,4,%o7
+	brnz	%o7,.Lcopy
+	nop
+	mov	1,%i0
+	ret
+	restore
+___
+
+######## Squaring path, entered from .Lenter when the ap and bp pointers compare equal.
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without following dedicated squaring procedure.
+######## NOTE(review): this is the 1.0.2f revision; upstream may have revised the squaring path since -- compare before modifying (not verifiable from this excerpt).
+$sbit="%i2";		# re-use $bp! (nothing on this path reads through $bp)
+# $sbit carries the bit shifted out when a cross product is doubled (add acc0,acc0 / or sbit / srlx 32)
+$code.=<<___;	# appended after the multiply path; every exit goes through .Ltail, which lives in the first template
+.align	32
+.Lbn_sqr_mont:
+	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
+	mulx	$apj,$mul0,$tmp0		!prologue!
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj			!prologue!
+
+	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
+	srlx	$car0,32,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1		!prologue!
+	and	$car0,1,$sbit
+	ld	[$np+8],$npj			!prologue!
+	srlx	$car0,1,$car0
+	add	$acc0,$car1,$car1
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0			!prologue!
+
+.Lsqr_1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	mov	$tmp1,$acc1
+	srlx	$acc0,32,$sbit
+	add	$j,4,$j				! j++
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	mov	$tmp0,$acc0
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_1st
+	add	$tp,4,$tp			! tp++
+!.Lsqr_1st
+
+	mulx	$apj,$mul0,$tmp0		! epilogue
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
+	add	$tmp1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
+	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
+	ld	[$ap+4],$mul0			! ap[1]
+	ld	[$ap+8],$apj			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp0,$mul1
+
+	mulx	$mul0,$mul0,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1
+	add	$tmp0,$car1,$car1
+	and	$car0,$mask,$acc0
+	ld	[$np+8],$npj			! np[2]
+	srlx	$car1,32,$car1
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	and	$car0,1,$sbit
+	add	$acc1,$car1,$car1
+	srlx	$car0,1,$car0
+	mov	12,$j
+	st	$car1,[%sp+$bias+$frame]	! tp[0]=
+	srlx	$car1,32,$car1
+	add	%sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc0,$acc0,$acc0
+	add	$j,4,$j				! j++
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_2nd
+	add	$tp,4,$tp			! tp++
+!.Lsqr_2nd
+
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+8],$mul0			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	mov	8,$i
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+	mov	4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner1
+	add	$tp,4,$tp
+!.Lsqr_inner1
+
+	add	$j,4,$j
+	ld	[$ap+$j],$apj			! ap[j]
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	ld	[$np+$j],$npj			! np[j]
+	add	$acc0,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$j,4,$j
+	cmp	$j,$num
+	be,pn	%icc,.Lsqr_no_inner2
+	add	$tp,4,$tp
+
+.Lsqr_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	ld	[$tp+8],$tpj			! tp[j]
+	or	$sbit,$acc0,$acc0
+	add	$j,4,$j				! j++
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner2
+	add	$tp,4,$tp			! tp++
+
+.Lsqr_no_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	add	$i,4,$i				! i++
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+$i],$mul0			! ap[j]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	add	$i,4,$tmp0
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+
+	cmp	$tmp0,$num			! i<num-1
+	bl	%icc,.Lsqr_outer
+	mov	4,$j
+
+.Lsqr_last:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_last
+	add	$tp,4,$tp
+!.Lsqr_last
+
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0		! recover $car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ba	.Ltail
+	add	$tp,8,$tp
+.type	$fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+$code =~ s{`([^`]*)`}{eval $1}gem;	# replace every `expr` in the template with the value Perl computes for it
+print($code);
+close(STDOUT);	# done writing the generated assembly
--- a/openssl-1.0.2f/crypto/bn/asm/sparcv9a-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike integer multiplier, which simply stalls whole CPU,
+# FPU is fully pipelined and can effectively emit 48 bit partial
+# product every cycle. Why not blended SPARC v9? One can argue that
+# making this module dependent on UltraSPARC VIS extension limits its
+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
+# implementations from compatibility matrix. But the rest, whole Sun
+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
+# VIS extension instructions used in this module. This is considered
+# good enough to not care about HAL SPARC64 users [if any] who have
+# integer-only pure SPARCv9 module to "fall down" to.
+
+# USI&II cores currently exhibit uniform 2x improvement [over pre-
+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
+# performance improves few percents for shorter keys and worsens few
+# percents for longer keys. This is because USIII integer multiplier
+# is >3x faster than USI&II one, which is harder to match [but see
+# TODO list below]. It should also be noted that SPARC64 V features
+# out-of-order execution, which *might* mean that integer multiplier
+# is pipelined, which in turn *might* be impossible to match... On
+# additional note, SPARC64 V implements FP Multiply-Add instruction,
+# which is perfectly usable in this context... In other words, as far
+# as Fujitsu SPARC64 V goes, talk to the author:-)
+
+# The implementation implies following "non-natural" limitations on
+# input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects, simply
+# doesn't give any performance gain.
+
+# TODO:
+# - modulo-schedule inner loop for better performance (on in-order
+#   execution core such as UltraSPARC this shall result in further
+#   noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow to interleave floating point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in *further* 20-50% performance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% - on
+# USIII&IV.
+
+$fname = "bn_mul_mont_fpu";
+
+# Default to the 32-bit ABI; -m64 or -xarch=v9 among the command-line
+# arguments selects the 64-bit one, which changes the stack bias and the
+# frame size.
+$bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
+($bias, $frame) = ($bits == 64)
+	? (2047, 192)
+	: (0,    128);	# 128 is 96 rounded up to largest known cache-line
+
+# scratch bytes reserved together with the frame (save %sp,-$frame-$locals)
+$locals = 64;
+
+# To serve both the 32- and the 64-bit ABI, integers wider than 32 bits
+# are kept only in %g1-%g4 and %o0-%o5; %l0-%l7 and %i0-%i5 carry nothing
+# but pointers, indexes and other small values.
+#
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+#                 const BN_ULONG *np, const BN_ULONG *n0, int num);
+($rp,$ap,$bp,$np,$n0,$num) = map { "%i$_" } 0..5;
+
+# $tp is t[num].  a[num] and n[num] are smashed to 32-bit words and saved
+# as double-precision FP values in the four vectors $ap_l, $ap_h, $np_l
+# and $np_h; this way a bunch of fxtods are eliminated in the second loop
+# and L1-cache aliasing is minimized.
+($tp,$ap_l,$ap_h,$np_l,$np_h,$i,$j,$mask) = map { "%l$_" } 0..7;	# $mask: 16-bit mask, 0xffff
+
+$n0    = "%g4";	# reassigned(!) to "64-bit" register
+$carry = "%i4";	# %i4 reused(!) for a carry bit
+
+# FP register naming chart
+#
+#     ..HILO
+#       dcba
+#   --------
+#        LOa
+#       LOb
+#      LOc
+#     LOd
+#      HIa
+#     HIb
+#    HIc
+#   HId
+#    ..a
+#   ..b
+($ba,$bb,$bc,$bd)       = map { "%f$_" } (0,2,4,6);
+($na,$nb,$nc,$nd)       = map { "%f$_" } (8,10,12,14);
+($alo,$alo_,$ahi,$ahi_) = map { "%f$_" } 16..19;
+($nlo,$nlo_,$nhi,$nhi_) = map { "%f$_" } 20..23;
+
+($dota,$dotb) = ("%f24","%f26");
+
+($aloa,$alob,$aloc,$alod) = map { "%f$_" } (32,34,36,38);
+($ahia,$ahib,$ahic,$ahid) = map { "%f$_" } (40,42,44,46);
+($nloa,$nlob,$nloc,$nlod) = map { "%f$_" } (48,50,52,54);
+($nhia,$nhib,$nhic,$nhid) = map { "%f$_" } (56,58,60,62);
+
+$ASI_FL16_P = 0xD2;	# magic ASI value to engage 16-bit FP load
+
+$code=<<___;
+.section	".text",#alloc,#execinstr
+
+.global $fname
+.align  32
+$fname:
+	save	%sp,-$frame-$locals,%sp
+
+	cmp	$num,4
+	bl,a,pn %icc,.Lret
+	clr	%i0
+	andcc	$num,1,%g0		! $num has to be even...
+	bnz,a,pn %icc,.Lret
+	clr	%i0			! signal "unsupported input value"
+
+	srl	$num,1,$num
+	sethi	%hi(0xffff),$mask
+	ld	[%i4+0],$n0		! $n0 reassigned, remember?
+	or	$mask,%lo(0xffff),$mask
+	ld	[%i4+4],%o0
+	sllx	%o0,32,%o0
+	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
+
+	sll	$num,3,$num		! num*=8
+
+	add	%sp,$bias,%o0		! real top of stack
+	sll	$num,2,%o1
+	add	%o1,$num,%o1		! %o1=num*5
+	sub	%o0,%o1,%o0
+	and	%o0,-2048,%o0		! optimize TLB utilization
+	sub	%o0,$bias,%sp		! alloca(5*num*8)
+
+	rd	%asi,%o7		! save %asi
+	add	%sp,$bias+$frame+$locals,$tp
+	add	$tp,$num,$ap_l
+	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
+	add	$ap_l,$num,$ap_h
+	add	$ap_h,$num,$np_l
+	add	$np_l,$num,$np_h
+
+	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads
+
+	add	$rp,$num,$rp		! readjust input pointers to point
+	add	$ap,$num,$ap		! at the ends too...
+	add	$bp,$num,$bp
+	add	$np,$num,$np
+
+	stx	%o7,[%sp+$bias+$frame+48]	! save %asi
+
+	sub	%g0,$num,$i		! i=-num
+	sub	%g0,$num,$j		! j=-num
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[0]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	add	$np,$j,%o5
+
+	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
+	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o3+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	fxtod	$alo,$alo
+	ldda	[%o4+0]%asi,$bb
+	fxtod	$ahi,$ahi
+	ldda	[%o4+6]%asi,$bc
+	fxtod	$nlo,$nlo
+	ldda	[%o4+4]%asi,$bd
+	fxtod	$nhi,$nhi
+
+	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+	fxtod	$na,$na
+	std	$ahi,[$ap_h+$j]
+	fxtod	$nb,$nb
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+	fxtod	$nc,$nc
+	std	$nhi,[$np_h+$j]
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	add	$j,8,$j
+	std	$nlob,[%sp+$bias+$frame+8]
+	add	$ap,$j,%o4
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$np,$j,%o5
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	!and	%o0,$mask,%o0
+	!and	%o1,$mask,%o1
+	!and	%o2,$mask,%o2
+	!sllx	%o1,16,%o1
+	!sllx	%o2,32,%o2
+	!sllx	%o3,48,%o7
+	!or	%o1,%o0,%o0
+	!or	%o2,%o0,%o0
+	!or	%o7,%o0,%o0		! 64-bit result
+	srlx	%o3,16,%g1		! 34-bit carry
+		fmuld	$ahi,$bb,$ahib
+
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$dota,$nloa,$nloa
+	faddd	$dotb,$nlob,$nlob
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.L1stskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+.align	32			! incidentally already aligned !
+.L1st:
+	add	$ap,$j,%o4
+	add	$np,$j,%o5
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		fmuld	$ahi,$bb,$ahib
+	sllx	%o1,16,%o1
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	sllx	%o2,32,%o2
+		fmuld	$ahi,$bc,$ahic
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	or	%o2,%o0,%o0
+		fmuld	$ahi,$bd,$ahid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	addcc	%g1,%o0,%o0
+		faddd	$dota,$nloa,$nloa
+	srlx	%o3,16,%g1		! 34-bit carry
+		faddd	$dotb,$nlob,$nlob
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	addcc	$j,8,$j
+	bnz,pt	%icc,.L1st
+	add	$tp,8,$tp
+
+.L1stskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+32],%o4
+	addcc	%g1,%o0,%o0
+	ldx	[%sp+$bias+$frame+40],%o5
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	mov	%g1,$carry
+	stx	%o4,[$tp]		! tp[num-1]=
+
+	ba	.Louter
+	add	$i,8,$i
+.align	32
+.Louter:
+	sub	%g0,$num,$j		! j=-num
+	add	%sp,$bias+$frame+$locals,$tp
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[i]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	ldx	[$tp],%o2		! tp[0]
+	mulx	%o1,%o0,%o0
+	addcc	%o2,%o0,%o0
+	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	ldda	[%o4+0]%asi,$bb
+	ldda	[%o4+6]%asi,$bc
+	ldda	[%o4+4]%asi,$bd
+
+	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	fxtod	$na,$na
+	ldd	[$ap_h+$j],$ahi
+	fxtod	$nb,$nb
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	fxtod	$nc,$nc
+	ldd	[$np_h+$j],$nhi
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	! why?
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[$tp],%o7
+		faddd	$nloc,$nhia,$nloc
+	addcc	%o7,%o0,%o0
+	! end-of-why?
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.Linnerskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ba	.Linner
+	nop
+.align	32
+.Linner:
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$nloc,$nhia,$nloc
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+		fdtox	$nlob,$nlob
+	addcc	%o7,%o0,%o0
+		fdtox	$nloc,$nloc
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+		fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	addcc	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+	bnz,pt	%icc,.Linner
+	add	$tp,8,$tp
+
+.Linnerskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	ldx	[%sp+$bias+$frame+32],%o4
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+40],%o5
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	%o7,%o0,%o0
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	$carry,%o4,%o4
+	stx	%o4,[$tp]		! tp[num-1]
+	mov	%g1,$carry
+	bcs,a	%xcc,.+8
+	add	$carry,1,$carry
+
+	addcc	$i,8,$i
+	bnz	%icc,.Louter
+	nop
+
+	add	$tp,8,$tp		! adjust tp to point at the end
+	orn	%g0,%g0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+
+.align	32
+.Lsub:
+	ldx	[$tp+%o7],%o0
+	add	$np,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	srlx	%o0,32,%o1
+	subccc	%o0,%o2,%o2
+	add	$rp,%o7,%g1
+	subccc	%o1,%o3,%o3
+	st	%o2,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lsub
+	st	%o3,[%g1+4]
+	subc	$carry,0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lcopy
+	nop
+
+.align	32
+.Lcopy:
+	ldx	[$tp+%o7],%o0
+	add	$rp,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	stx	%g0,[$tp+%o7]
+	and	%o0,%g4,%o0
+	srlx	%o0,32,%o1
+	andn	%o2,%g4,%o2
+	andn	%o3,%g4,%o3
+	or	%o2,%o0,%o0
+	or	%o3,%o1,%o1
+	st	%o0,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lcopy
+	st	%o1,[%g1+4]
+	sub	%g0,$num,%o7		! n=-num
+
+.Lzap:
+	stx	%g0,[$ap_l+%o7]
+	stx	%g0,[$ap_h+%o7]
+	stx	%g0,[$np_l+%o7]
+	stx	%g0,[$np_h+%o7]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lzap
+	nop
+
+	ldx	[%sp+$bias+$frame+48],%o7
+	wr	%g0,%o7,%asi		! restore %asi
+
+	mov	1,%i0
+.Lret:
+	ret
+	restore
+.type   $fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+
+# Expand every `...` span in the generated text: the backquoted
+# expression is evaluated as Perl and replaced by its result.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# The substitution below makes it possible to assemble this module
+# without demanding VIS extensions on the command line, e.g. -xarch=v9
+# vs. -xarch=v9a. I dare to do this, because VIS capability is detected
+# at run-time now and this routine is not called on a CPU that is not
+# capable of executing it. Do note that fzeros is not the only VIS
+# dependency! Another dependency is implicit and is just _a_ numerical
+# value loaded to the %asi register, which the assembler can't
+# recognize as VIS specific...
+#
+# Each "fzeros %fN" becomes a raw .word holding 0x81b00c20 with N
+# or-ed in at bit 25; the original mnemonic is kept as a trailing
+# assembler comment.
+$code =~ s/fzeros\s+%f([0-9]+)/
+	   sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
+	  /gem;
+
+# Emit the finished assembly on standard output.
+print $code;
+# Close STDOUT explicitly so that buffered output is flushed.
+close STDOUT;
--- a/openssl-1.0.2f/crypto/bn/asm/via-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', VIA-specific instruction accessing
+# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
+# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on 1.5GHz VIA Esther processor.
+# Lines marked with "software integer" denote performance of hand-
+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
+# refers to hand-coded SSE2 Montgomery multiplication procedure found
+# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
+# Padlock SDK 2.0.1 available for download from VIA, which naturally
+# utilizes the magic 'repz montmul' instruction. And finally "hardware
+# this" refers to *this* implementation which also uses 'repz montmul'
+#
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.001720s 0.000140s    581.4   7149.7	software integer
+# rsa  512 bits 0.000690s 0.000086s   1450.3  11606.0	software SSE2
+# rsa  512 bits 0.006136s 0.000201s    163.0   4974.5	hardware VIA SDK
+# rsa  512 bits 0.000712s 0.000050s   1404.9  19858.5	hardware this
+#
+# rsa 1024 bits 0.008518s 0.000413s    117.4   2420.8	software integer
+# rsa 1024 bits 0.004275s 0.000277s    233.9   3609.7	software SSE2
+# rsa 1024 bits 0.012136s 0.000260s     82.4   3844.5	hardware VIA SDK
+# rsa 1024 bits 0.002522s 0.000116s    396.5   8650.9	hardware this
+#
+# rsa 2048 bits 0.050101s 0.001371s     20.0    729.6	software integer
+# rsa 2048 bits 0.030273s 0.001008s     33.0    991.9	software SSE2
+# rsa 2048 bits 0.030833s 0.000976s     32.4   1025.1	hardware VIA SDK
+# rsa 2048 bits 0.011879s 0.000342s     84.2   2921.7	hardware this
+#
+# rsa 4096 bits 0.327097s 0.004859s      3.1    205.8	software integer
+# rsa 4096 bits 0.229318s 0.003859s      4.4    259.2	software SSE2
+# rsa 4096 bits 0.233953s 0.003274s      4.3    305.4	hardware VIA SDK
+# rsa 4096 bits 0.070493s 0.001166s     14.2    857.6	hardware this
+#
+# dsa  512 bits 0.001342s 0.001651s    745.2    605.7	software integer
+# dsa  512 bits 0.000844s 0.000987s   1185.3   1013.1	software SSE2
+# dsa  512 bits 0.001902s 0.002247s    525.6    444.9	hardware VIA SDK
+# dsa  512 bits 0.000458s 0.000524s   2182.2   1909.1	hardware this
+#
+# dsa 1024 bits 0.003964s 0.004926s    252.3    203.0	software integer
+# dsa 1024 bits 0.002686s 0.003166s    372.3    315.8	software SSE2
+# dsa 1024 bits 0.002397s 0.002823s    417.1    354.3	hardware VIA SDK
+# dsa 1024 bits 0.000978s 0.001170s   1022.2    855.0	hardware this
+#
+# dsa 2048 bits 0.013280s 0.016518s     75.3     60.5	software integer
+# dsa 2048 bits 0.009911s 0.011522s    100.9     86.8	software SSE2
+# dsa 2048 bits 0.009542s 0.011763s    104.8     85.0	hardware VIA SDK
+# dsa 2048 bits 0.002884s 0.003352s    346.8    298.3	hardware this
+#
+# To give you some other reference point here is output for 2.4GHz P4
+# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
+# SSE2" in above terms.
+#
+# rsa  512 bits 0.000407s 0.000047s   2454.2  21137.0
+# rsa 1024 bits 0.002426s 0.000141s    412.1   7100.0
+# rsa 2048 bits 0.015046s 0.000491s     66.5   2034.9
+# rsa 4096 bits 0.109770s 0.002379s      9.1    420.3
+# dsa  512 bits 0.000438s 0.000525s   2281.1   1904.1
+# dsa 1024 bits 0.001346s 0.001595s    742.7    627.0
+# dsa 2048 bits 0.004745s 0.005582s    210.7    179.1
+#
+# Conclusions: 
+# - VIA SDK leaves a *lot* of room for improvement (which this
+#   implementation successfully fills:-);
+# - 'rep montmul' gives up to >3x performance improvement depending on
+#   key length;
+# - in terms of absolute performance it delivers approximately as much
+#   as modern out-of-order 32-bit cores [again, for longer keys].
+
+# Work out the directory this script was started from and make
+# x86asm.pl loadable either from that directory or from ../../perlasm
+# relative to it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"via-mont.pl");
+
+# The generated routine takes the same six arguments as bn_mul_mont:
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+# It leaves 0 in eax when num is rejected by the checks below and 1
+# after the multiplication has been carried out.
+$func="bn_mul_mont_padlock";
+
+$pad=16*1;	# amount of reserved bytes on top of every vector
+
+# stack layout
+$mZeroPrime=&DWP(0,"esp");		# these are specified by VIA
+$A=&DWP(4,"esp");
+$B=&DWP(8,"esp");
+$T=&DWP(12,"esp");
+$M=&DWP(16,"esp");
+$scratch=&DWP(20,"esp");
+$rp=&DWP(24,"esp");			# these are mine
+$sp=&DWP(28,"esp");
+# &DWP(32,"esp")			# 32 byte scratch area
+# &DWP(64+(4*$num+$pad)*0,"esp")	# padded tp[num]
+# &DWP(64+(4*$num+$pad)*1,"esp")	# padded copy of ap[num]
+# &DWP(64+(4*$num+$pad)*2,"esp")	# padded copy of bp[num]
+# &DWP(64+(4*$num+$pad)*3,"esp")	# padded copy of np[num]
+# Note that SDK suggests to unconditionally allocate 2K per vector. This
+# has quite an impact on performance. It naturally depends on key length,
+# but to give an example 1024 bit private RSA key operations suffer >30%
+# penalty. I allocate only as much as actually required...
+
+&function_begin($func);
+	# eax is the return value; it stays 0 on every early exit to "leave"
+	&xor	("eax","eax");
+	&mov	("ecx",&wparam(5));	# num
+	# meet VIA's limitations for num [note that the specification
+	# expresses them in bits, while we work with amount of 32-bit words]
+	&test	("ecx",3);
+	&jnz	(&label("leave"));	# num % 4 != 0
+	&cmp	("ecx",8);
+	&jb	(&label("leave"));	# num < 8
+	&cmp	("ecx",1024);
+	&ja	(&label("leave"));	# num > 1024
+
+	# save caller's flags (popped again just before "leave") and clear
+	# the direction flag for the rep stos/movs sequences below
+	&pushf	();
+	&cld	();
+
+	&mov	("edi",&wparam(0));	# rp
+	&mov	("eax",&wparam(1));	# ap
+	&mov	("ebx",&wparam(2));	# bp
+	&mov	("edx",&wparam(3));	# np
+	&mov	("esi",&wparam(4));	# n0
+	&mov	("esi",&DWP(0,"esi"));	# *n0
+
+	&lea	("ecx",&DWP($pad,"","ecx",4));	# ecx becomes vector size in bytes
+	&lea	("ebp",&DWP(64,"","ecx",4));	# allocate 4 vectors + 64 bytes
+	&neg	("ebp");
+	&add	("ebp","esp");
+	&and	("ebp",-64);		# align to cache-line
+	&xchg	("ebp","esp");		# alloca
+	# after the exchange ebp holds the original esp, esp the new frame
+
+	&mov	($rp,"edi");		# save rp
+	&mov	($sp,"ebp");		# save esp
+
+	&mov	($mZeroPrime,"esi");
+	&lea	("esi",&DWP(64,"esp"));	# tp
+	&mov	($T,"esi");
+	&lea	("edi",&DWP(32,"esp"));	# scratch area
+	&mov	($scratch,"edi");
+	&mov	("esi","eax");		# esi=ap, source for the first memcpy below
+
+	&lea	("ebp",&DWP(-$pad,"ecx"));
+	&shr	("ebp",2);		# restore original num value in ebp
+
+	&xor	("eax","eax");		# eax=0 is the fill value for every rep stosl
+
+	# edi still points at the scratch area, so this clears the scratch
+	# area, tp[num] and tp's padding in one go
+	&mov	("ecx","ebp");
+	&lea	("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("ecx","ebp");
+	&lea	("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
+	&mov	($A,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded ap copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","ebx");
+	&mov	($B,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded bp copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","edx");
+	&mov	($M,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded np copy...
+
+	# let magic happen...
+	# esi is pointed at the bottom of the frame, i.e. at the block of
+	# slots ($mZeroPrime, $A, $B, $T, $M, $scratch) filled in above
+	&mov	("ecx","ebp");
+	&mov	("esi","esp");
+	&shl	("ecx",5);		# convert word counter to bit counter
+	&align	(4);
+	&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
+
+	&mov	("ecx","ebp");			# loop counter for "sub" = num
+	&lea	("esi",&DWP(64,"esp"));		# tp
+	# edi still points at the end of padded np copy...
+	&neg	("ebp");
+	&lea	("ebp",&DWP(-$pad,"edi","ebp",4));	# so just "rewind"
+	&mov	("edi",$rp);			# restore rp
+	&xor	("edx","edx");			# i=0 and clear CF
+
+	# rp[] = tp[] - np[], borrow propagated through CF; lea and loop
+	# are used for the bookkeeping because neither modifies CF
+&set_label("sub",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&sbb	("eax",&DWP(0,"ebp","edx",4));
+	&mov	(&DWP(0,"edi","edx",4),"eax");	# rp[i]=tp[i]-np[i]
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("sub"));		# doesn't affect CF!
+
+	# eax = tp[num] minus the final borrow; it is then used as a mask:
+	# esi = (tp & eax) | (rp & ~eax)
+	&mov	("eax",&DWP(0,"esi","edx",4));	# upmost overflow bit
+	&sbb	("eax",0);
+	&and	("esi","eax");
+	&not	("eax");
+	&mov	("ebp","edi");
+	&and	("ebp","eax");
+	&or	("esi","ebp");			# tp=carry?tp:rp
+
+	&mov	("ecx","edx");			# num
+	&xor	("edx","edx");			# i=0
+
+	# copy the selected vector to rp[]; tp[i] is overwritten on the way.
+	# Note that the value stored into tp[i] is ecx, the remaining loop
+	# count, rather than zero.
+&set_label("copy",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&mov	(&DWP(64,"esp","edx",4),"ecx");	# zap tp
+	&mov	(&DWP(0,"edi","edx",4),"eax");
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("copy"));
+
+	&mov	("ebp",$sp);		# fetch saved esp before the frame is wiped
+	&xor	("eax","eax");
+
+	&mov	("ecx",64/4);
+	&mov	("edi","esp");		# zap frame including scratch area
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	# zap copies of ap, bp and np
+	# edx equals num here, as left behind by the "copy" loop; the count
+	# in ecx is 3*num+3*$pad/4 words, i.e. three padded vectors
+	&lea	("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+	&lea	("ecx",&DWP(3*$pad/4,"edx","edx",2));
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("esp","ebp");		# release the frame
+	&inc	("eax");		# signal "done"
+	&popf	();
+&set_label("leave");
+&function_end($func);
+
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
--- a/openssl-1.0.2f/crypto/bn/asm/vis3-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/vis3-mont.pl
@@ -0,0 +1,373 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2012.
+#
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
+# onward. There are three new instructions used here: umulxhi,
+# addxc[cc] and initializing store. On T3 RSA private key operations
+# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
+# lengths. This is without dedicated squaring procedure. On T4
+# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
+# for reference purposes, because T4 has dedicated Montgomery
+# multiplication and squaring *instructions* that deliver even more.
+
+# Pick the target ABI from the command line: any argument containing
+# -m64 or -xarch=v9 selects the 64-bit build, otherwise 32-bit is used.
+$bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
+
+# Stack bias and frame size that go with the selected ABI.
+($bias,$frame) = ($bits==64) ? (2047,192) : (0,112);
+
+# Only the 64-bit build declares %g2 and %g3 as scratch registers.
+if ($bits==64) {
+$code.=<<___;
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
+}
+
+# Everything that follows is emitted into the text section.
+$code.=<<___;
+.section	".text",#alloc,#execinstr
+___
+
+# Working registers used by the main body of the routine:
+# $n0,$m0,$m1,$lo0,$hi0 are %g1..%g5 and
+# $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj are %o0..%o5 and %o7.
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# Argument registers as seen on entry, i.e. before the "save" emitted
+# below.
+# int bn_mul_mont(
+$rp="%o0";	# BN_ULONG *rp,
+$ap="%o1";	# const BN_ULONG *ap,
+$bp="%o2";	# const BN_ULONG *bp,
+$np="%o3";	# const BN_ULONG *np,
+$n0p="%o4";	# const BN_ULONG *n0,
+$num="%o5";	# int num);	# caller ensures that num is even
+				# and >=6
+# Entry sequence: num is scaled to a byte count, the per-vector buffer
+# size is rounded up to a multiple of 64 bytes, three such buffers plus
+# $frame bytes are reserved below a 64-byte aligned address, and the
+# resulting (negative) displacement is handed to "save".
+$code.=<<___;
+.globl	bn_mul_mont_vis3
+.align	32
+bn_mul_mont_vis3:
+	add	%sp,	$bias,	%g4	! real top of stack
+	sll	$num,	2,	$num	! size in bytes
+	add	$num,	63,	%g5
+	andn	%g5,	63,	%g5	! buffer size rounded up to 64 bytes
+	add	%g5,	%g5,	%g1
+	add	%g5,	%g1,	%g1	! 3*buffer size
+	sub	%g4,	%g1,	%g1
+	andn	%g1,	63,	%g1	! align at 64 byte
+	sub	%g1,	$frame,	%g1	! new top of stack
+	sub	%g1,	%g4,	%g1
+
+	save	%sp,	%g1,	%sp
+___
+
+#	+-------------------------------+<-----	%sp
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	| __int64 tmp[0]		|
+#	+-------------------------------+
+#	.				.
+#	.				.
+#	+-------------------------------+<----- aligned at 64 bytes
+#	| __int64 ap[1..0]		|	converted ap[]
+#	+-------------------------------+
+#	| __int64 np[1..0]		|	converted np[]
+#	+-------------------------------+
+#	| __int64 ap[3..2]		|
+#	.				.
+#	.				.
+#	+-------------------------------+
+# Main body of bn_mul_mont_vis3. It is emitted after the "save" of the
+# entry sequence, so the six arguments are addressed through %i0..%i5
+# and %l0..%l7 serve as temporaries ($t0..$t3), loop counter ($cnt),
+# tp pointer ($tp), buffer size ($bufsz) and pointer into the buffer
+# of converted ap/np values ($anp). $ovf and $i deliberately share
+# registers with $t0 and $t1.
+#
+# Structure of the emitted code, following its labels:
+#   (start)  - assemble n0, bp[0], ap[0..1] and np[0..1] from 32-bit
+#              halves, store the converted ap/np values at $anp and
+#              start the first products;
+#   .L1st    - first pass: np[j]*m1+ap[j]*bp[0] is stored to tp[] while
+#              the remaining ap[j]/np[j] are converted and stored,
+#              interleaved, at $anp;
+#   .Louter/.Linner - one pass per further bp[i], reading ap[j]/np[j]
+#              back from $anp and accumulating into tp[];
+#   .Lsub    - tp[]-np[] is written to rp[] as pairs of 32-bit words;
+#   .Lcopy   - tp[] or rp[] (chosen from the final borrow and $ovf) is
+#              copied to rp[], while tp[] and the $anp buffer are zapped.
+#
+# The return value 1 is placed in %i0: the closing "restore" switches
+# back to the caller's register window, where this register is seen as
+# %o0. Writing %o0 before "restore" would be lost and the caller would
+# receive whatever %i0 (the advanced rp pointer) happens to hold.
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+	ld	[$n0p+0],	$t0	! pull n0[0..1] value
+	add	%sp, $bias+$frame, $tp
+	ld	[$n0p+4],	$t1
+	add	$tp,	%g5,	$anp
+	ld	[$bp+0],	$t2	! m0=bp[0]
+	sllx	$t1,	32,	$n0
+	ld	[$bp+4],	$t3
+	or	$t0,	$n0,	$n0
+	add	$bp,	8,	$bp
+
+	ld	[$ap+0],	$t0	! ap[0]
+	sllx	$t3,	32,	$m0
+	ld	[$ap+4],	$t1
+	or	$t2,	$m0,	$m0
+
+	ld	[$ap+8],	$t2	! ap[1]
+	sllx	$t1,	32,	$aj
+	ld	[$ap+12],	$t3
+	or	$t0,	$aj,	$aj
+	add	$ap,	16,	$ap
+	stx	$aj,	[$anp]		! converted ap[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
+	umulxhi	$aj,	$m0,	$hi0
+
+	ld	[$np+0],	$t0	! np[0]
+	sllx	$t3,	32,	$aj
+	ld	[$np+4],	$t1
+	or	$t2,	$aj,	$aj
+
+	ld	[$np+8],	$t2	! np[1]
+	sllx	$t1,	32,	$nj
+	ld	[$np+12],	$t3
+	or	$t0, $nj,	$nj
+	add	$np,	16,	$np
+	stx	$nj,	[$anp+8]	! converted np[0]
+
+	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
+	stx	$aj,	[$anp+16]	! converted ap[1]
+
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+
+	sllx	$t3,	32,	$nj
+	or	$t2,	$nj,	$nj
+	stx	$nj,	[$anp+24]	! converted np[1]
+	add	$anp,	32,	$anp
+
+	addcc	$lo0,	$lo1,	$lo1
+	addxc	%g0,	$hi1,	$hi1
+
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.L1st
+	sub	$num,	24,	$cnt	! cnt=num-3
+
+.align	16
+.L1st:
+	ld	[$ap+0],	$t0	! ap[j]
+	addcc	$alo,	$hi0,	$lo0
+	ld	[$ap+4],	$t1
+	addxc	$aj,	%g0,	$hi0
+
+	sllx	$t1,	32,	$aj
+	add	$ap,	8,	$ap
+	or	$t0,	$aj,	$aj
+	stx	$aj,	[$anp]		! converted ap[j]
+
+	ld	[$np+0],	$t2	! np[j]
+	addcc	$nlo,	$hi1,	$lo1
+	ld	[$np+4],	$t3
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+
+	sllx	$t3,	32,	$nj
+	add	$np,	8,	$np
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
+	or	$t2,	$nj,	$nj
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	stx	$nj,	[$anp+8]	! converted np[j]
+	add	$anp,	16,	$anp	! anp++
+
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp	! tp++
+
+	brnz,pt	$cnt,	.L1st
+	sub	$cnt,	8,	$cnt	! j--
+!.L1st
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+
+	addcc	$hi0,	$hi1,	$hi1
+	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
+	stx	$hi1,	[$tp]
+	add	$tp,	8,	$tp
+
+	ba	.Louter
+	sub	$num,	16,	$i	! i=num-2
+
+.align	16
+.Louter:
+	ld	[$bp+0],	$t2	! m0=bp[i]
+	ld	[$bp+4],	$t3
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+
+	add	$bp,	8,	$bp
+	sllx	$t3,	32,	$m0
+	ldx	[$anp+0],	$aj	! ap[0]
+	or	$t2,	$m0,	$m0
+	ldx	[$anp+8],	$nj	! np[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
+	ldx	[$tp],		$tj	! tp[0]
+	umulxhi	$aj,	$m0,	$hi0
+	ldx	[$anp+16],	$aj	! ap[1]
+	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
+	addxc	%g0,	$hi0,	$hi0
+	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+	ldx	[$anp+24],	$nj	! np[1]
+	add	$anp,	32,	$anp
+	addcc	$lo1,	$lo0,	$lo1
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	addxc	%g0,	$hi1,	$hi1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.Linner
+	sub	$num,	24,	$cnt	! cnt=num-3
+.align	16
+.Linner:
+	addcc	$alo,	$hi0,	$lo0
+	ldx	[$tp+8],	$tj	! tp[j]
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	ldx	[$anp+0],	$aj	! ap[j]
+	addcc	$nlo,	$hi1,	$lo1
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	ldx	[$anp+8],	$nj	! np[j]
+	add	$anp,	16,	$anp
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addxc	%g0,	$hi0,	$hi0
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+	brnz,pt	$cnt,	.Linner
+	sub	$cnt,	8,	$cnt
+!.Linner
+	ldx	[$tp+8],	$tj	! tp[j]
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi0,	$hi0
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+
+	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
+	addxccc	$hi1,	$hi0,	$hi1
+	addxc	%g0,	%g0,	$ovf
+	stx	$hi1,	[$tp+8]
+	add	$tp,	16,	$tp
+
+	brnz,pt	$i,	.Louter
+	sub	$i,	8,	$i
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+	ba	.Lsub
+	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
+
+.align	16
+.Lsub:
+	ldx	[$tp],		$tj
+	add	$tp,	8,	$tp
+	ldx	[$anp+8],	$nj
+	add	$anp,	16,	$anp
+	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
+	srlx	$tj,	32,	$tj
+	srlx	$nj,	32,	$nj
+	subccc	$tj,	$nj,	$t3
+	add	$rp,	8,	$rp
+	st	$t2,	[$rp-4]		! reverse order
+	st	$t3,	[$rp-8]
+	brnz,pt	$cnt,	.Lsub
+	sub	$cnt,	8,	$cnt
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+	sub	$rp,	$num,	$rp
+
+	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
+	and	$tp,	$ovf,	$ap
+	andn	$rp,	$ovf,	$np
+	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
+	ba	.Lcopy
+	sub	$num,	8,	$cnt
+
+.align	16
+.Lcopy:					! copy or in-place refresh
+	ld	[$ap+0],	$t2
+	ld	[$ap+4],	$t3
+	add	$ap,	8,	$ap
+	stx	%g0,	[$tp]		! zap
+	add	$tp,	8,	$tp
+	stx	%g0,	[$anp]		! zap
+	stx	%g0,	[$anp+8]
+	add	$anp,	16,	$anp
+	st	$t3,	[$rp+0]		! flip order
+	st	$t2,	[$rp+4]
+	add	$rp,	8,	$rp
+	brnz	$cnt,	.Lcopy
+	sub	$cnt,	8,	$cnt
+
+	mov	1,	%i0		! return value, seen as %o0 by the caller after restore
+	ret
+	restore
+.type	bn_mul_mont_vis3, #function
+.size	bn_mul_mont_vis3, .-bn_mul_mont_vis3
+.asciz  "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+# Purpose of this subroutine is to explicitly encode VIS3 instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve the option to produce a "universal" binary and let
+# the programmer detect if the current CPU is VIS capable at run-time.
+#
+# unvis3(mnemonic, rs1, rs2, rd)
+#   mnemonic     - instruction name; "addxc", "addxccc" and "umulxhi"
+#                  are the ones that get encoded
+#   rs1, rs2, rd - operands written as %gN, %oN, %lN or %iN, N in 0..7
+#
+# Returns ".word\t0x<encoding> !<original text>" for a recognized
+# instruction whose three operands are all valid integer registers.
+# Anything else (unknown mnemonic, or an operand that is not exactly
+# one of the 32 integer registers) is returned unchanged as
+# "mnemonic\trs1,rs2,rd", so that the assembler gets to see it and
+# can report it instead of a wrong register being encoded silently.
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my %visopf = (	"addxc"		=> 0x011,
+		"addxccc"	=> 0x013,
+		"umulxhi"	=> 0x016	);
+my $ref = "$mnemonic\t$rs1,$rs2,$rd";
+my $opf = $visopf{$mnemonic};
+
+    return $ref unless (defined($opf));
+
+    # $reg aliases each operand in turn, so a successful match replaces
+    # the textual register name by its 5-bit register number.
+    foreach my $reg ($rs1,$rs2,$rd) {
+	# register numbers run from 0 to 7 within each of the four groups
+	return $ref if ($reg !~ /^%([goli])([0-7])$/);
+	$reg = $bias{$1}+$2;
+    }
+
+    return sprintf ".word\t0x%08x !%s",
+		    0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+		    $ref;
+}
+
+# Write out the generated assembly one line at a time.
+for (split("\n",$code)) {
+	# replace each `...` span by the result of evaluating it as Perl
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	# pass three-register umulxhi/addxc* instructions through unvis3()
+	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge;
+
+	print "$_\n";
+}
+
+# Close STDOUT explicitly so that buffered output is flushed.
+close(STDOUT);
--- a/openssl-1.0.2f/crypto/bn/asm/vms.mar
+++ b/openssl-1.0.2f/crypto/bn/asm/vms.mar
--- a/openssl-1.0.2f/crypto/bn/asm/x86-gf2m.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII		16%-30%
+# P4		12%-12%
+# Opteron	18%-40%
+# Core2		19%-44%
+# Atom		38%-64%
+# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that above improvement coefficients are not coefficients for
+# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
+# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
+# is more and more dominated by other subroutines, most notably by
+# BN_GF2m_mod[_mul]_arr...
+
+# Locate the perlasm support library relative to this script and load it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+# $x86only is true when the last command-line argument is "386"; the
+# MMX/PCLMULQDQ code paths below are then not generated at all.
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+# $sse2 is set when -DOPENSSL_IA32_SSE2 appears among the arguments.
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+# Integer register assignment shared by both 1x1 multipliers:
+# $a/$b are the two input words, $a1/$a2/$a4 hold multiples of $a.
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+# MMX register assignment for _mul_1x1_mmx. Note that $A and $T[1] name
+# the same register (mm2).
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+# two index registers, used alternately
+@i=("esi","edi");
+
+# _mul_1x1_mmx: multiply the 32-bit polynomials in $a (eax) and $b (ebx),
+# accumulating the product in $R (mm0). Generated only when !$x86only.
+#
+# An 8-entry dword table is built at esp+0..esp+28 from a1=a&0x3fffffff,
+# a2=(a+a)&0x7fffffff and a4=4*a (see the per-line comments for the
+# contents of each slot). $b is then consumed three bits at a time; each
+# 3-bit value selects a table entry, which is shifted left by 3*n and
+# XORed into $R. The two top bits of a, masked out of a1/a2, are folded in
+# separately through $B30 and $B31.
+					if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &and	($a1,0x3fffffff);
+	 &lea	($a4,&DWP(0,$a2,$a2));
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	&movd	($A,$a);
+	&movd	($B,$b);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	&pxor	($B31,$B31);
+	&pxor	($B30,$B30);
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	&pcmpgtd($B31,$A);		# broadcast 31st bit
+	&paddd	($A,$A);		# $A<<=1
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	&pand	($B31,$B);
+	&pcmpgtd($B30,$A);		# broadcast 30th bit
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&psllq	($B31,31);
+	&pand	($B30,$B);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&mov	(@i[0],0x7);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	 &mov	($a4,@i[0]);		# table is complete; $a4 now keeps the 0x7 mask
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	&mov	(@i[1],$a4);
+	&psllq	($B30,30);
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	&movd	($R,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],$a4);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	# windows 1..8: look up, shift by 3*n, accumulate; the next index is
+	# extracted in parallel
+	for($n=1;$n<9;$n++) {
+		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],$a4);
+		&psllq	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	($b,3);
+		&pxor	($R,@T[1]);
+
+		# rotate the index and temporary register pairs
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	# last two windows (n=9 and n=10), interleaved with the top-bit terms
+	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+	&pxor	($R,$B30);
+	&psllq	(@T[1],3*$n++);
+	&pxor	($R,@T[1]);
+
+	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
+	&pxor	($R,$B31);
+	&psllq	(@T[0],3*$n);
+	&add	("esp",32+4);
+	&pxor	($R,@T[0]);
+	&ret	();
+&function_end_B("_mul_1x1_mmx");
+					}
+
+# Register assignment for the integer-only multiplier: the 64-bit result
+# is returned in $hi:$lo (edx:eax). $lo is the same register as $a (eax),
+# and @T is re-bound from MMX registers to two integer scratch registers.
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+# _mul_1x1_ialu: integer-only counterpart of _mul_1x1_mmx. Multiplies the
+# 32-bit polynomials in $a (eax) and $b (ebx) and leaves the product in
+# $hi:$lo. The same 8-entry table of a1/a2/a4 combinations is built at
+# esp+0..esp+28; $b is consumed three bits at a time, and every table
+# entry is split into a low part (shl by 3*n) and a high part
+# (shr by 32-3*n) that are XORed into $lo and $hi respectively.
+&function_begin_B("_mul_1x1_ialu");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &lea	($a4,&DWP(0,"",$a,4));
+	 &and	($a1,0x3fffffff);
+	&lea	(@i[1],&DWP(0,$lo,$lo));	# a<<1, so that bit 30 becomes the sign bit
+	&sar	($lo,31);		# broadcast 31st bit
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&sar	(@i[1],31);		# broadcast 30th bit
+	&and	($lo,$b);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&and	(@i[1],$b);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	# contribution of bit 31 of a: (b<<31) split across $hi:$lo
+	&mov	($hi,$lo);
+	&shl	($lo,31);
+	&mov	(@T[0],@i[1]);
+	&shr	($hi,1);
+
+	# contribution of bit 30 of a: (b<<30), merged below
+	 &mov	(@i[0],0x7);
+	&shl	(@i[1],30);
+	 &and	(@i[0],$b);
+	&shr	(@T[0],2);
+	&xor	($lo,@i[1]);
+
+	&shr	($b,3);
+	&mov	(@i[1],0x7);		# 5-byte instruction!?
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	 &xor	($hi,@T[0]);
+	&xor	($lo,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],0x7);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	# windows 1..8
+	for($n=1;$n<9;$n++) {
+		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],0x7);
+		&mov	(@T[0],@T[1]);
+		&shl	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	(@T[0],32-3*$n);
+		&xor	($lo,@T[1]);
+		&shr	($b,3);
+		&xor	($hi,@T[0]);
+
+		# rotate the index and temporary register pairs
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	# last two windows (n=9 and n=10); the index registers are reused as
+	# temporaries once their table entries have been fetched
+	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+	&mov	(@T[0],@T[1]);
+	&shl	(@T[1],3*$n);
+	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
+	&shr	(@T[0],32-3*$n);	$n++;
+	&mov	(@i[0],@i[1]);
+	&xor	($lo,@T[1]);
+	&shl	(@i[1],3*$n);
+	&xor	($hi,@T[0]);
+	&shr	(@i[0],32-3*$n);
+	&xor	($lo,@i[1]);
+	&xor	($hi,@i[0]);
+
+	&add	("esp",32+4);
+	&ret	();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#
+# Multiplies the two-word polynomials a1:a0 and b1:b0 and stores the
+# four-word product at r. Unless generated for "386", the code dispatches
+# at run time on OPENSSL_ia32cap_P: without MMX it falls through to the
+# integer path ("ialu"); with -DOPENSSL_IA32_SSE2 and both the FXSR and
+# PCLMULQDQ bits set it uses a single PCLMULQDQ; otherwise the MMX path.
+# The MMX and integer paths both use three 1x1 multiplications
+# (a1·b1, a0·b0, (a0+a1)·(b0+b1)) and combine them Karatsuba-style.
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+	&picmeup("edx","OPENSSL_ia32cap_P");
+	&mov	("eax",&DWP(0,"edx"));
+	&mov	("edx",&DWP(4,"edx"));
+	&test	("eax",1<<23);		# check MMX bit
+	&jz	(&label("ialu"));
+if ($sse2) {
+	&test	("eax",1<<24);		# check FXSR bit
+	&jz	(&label("mmx"));
+	&test	("edx",1<<1);		# check PCLMULQDQ bit
+	&jz	(&label("mmx"));
+
+	# load the 16 bytes at esp+8 (nothing has been pushed yet, so
+	# presumably the a1,a0,b1,b0 arguments), swap each pair of dwords,
+	# carry-less multiply one 64-bit half by the other and store the
+	# 128-bit product at r
+	&movups		("xmm0",&QWP(8,"esp"));
+	&shufps		("xmm0","xmm0",0b10110001);
+	&pclmulqdq	("xmm0","xmm0",1);
+	&mov		("eax",&DWP(4,"esp"));
+	&movups		(&QWP(0,"eax"),"xmm0");
+	&ret	();
+
+&set_label("mmx",16);
+}
+	# ---- MMX path: partial products are kept in mm7 and mm6 ----
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_mmx");	# a1·b1
+	&movq	("mm7",$R);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# a0·b0
+	&movq	("mm6",$R);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# (a0+a1)·(b0+b1)
+	&pxor	($R,"mm7");
+	&mov	($a,&wparam(0));
+	&pxor	($R,"mm6");		# (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+	# the middle term straddles the two result halves: its low 32 bits
+	# go into the upper dword of the low half, its high 32 bits into the
+	# lower dword of the high half
+	&movq	($A,$R);
+	&psllq	($R,32);
+	&pop	("edi");
+	&psrlq	($A,32);
+	&pop	("esi");
+	&pxor	($R,"mm6");
+	&pop	("ebx");
+	&pxor	($A,"mm7");
+	&movq	(&QWP(0,$a),$R);
+	&pop	("ebp");
+	&movq	(&QWP(8,$a),$A);
+	&emms	();
+	&ret	();
+&set_label("ialu",16);
+}
+	# ---- integer path: partial products are kept in a 4+1 dword frame ----
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&stack_push(4+1);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_ialu");	# a1·b1
+	&mov	(&DWP(8,"esp"),$lo);
+	&mov	(&DWP(12,"esp"),$hi);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# a0·b0
+	&mov	(&DWP(0,"esp"),$lo);
+	&mov	(&DWP(4,"esp"),$hi);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# (a0+a1)·(b0+b1)
+
+	# @r = the four saved words: a0·b0 (low, high), a1·b1 (low, high)
+	&mov	("ebp",&wparam(0));
+		 @r=("ebx","ecx","edi","esi");
+	&mov	(@r[0],&DWP(0,"esp"));
+	&mov	(@r[1],&DWP(4,"esp"));
+	&mov	(@r[2],&DWP(8,"esp"));
+	&mov	(@r[3],&DWP(12,"esp"));
+
+	# r[0] and r[3] are stored as they are; r[1] and r[2] additionally
+	# receive the middle term
+	&xor	($lo,$hi);
+	&xor	($hi,@r[1]);
+	&xor	($lo,@r[0]);
+	&mov	(&DWP(0,"ebp"),@r[0]);
+	&xor	($hi,@r[2]);
+	&mov	(&DWP(12,"ebp"),@r[3]);
+	&xor	($lo,@r[3]);
+	&stack_pop(4+1);
+	&xor	($hi,@r[3]);
+	&pop	("edi");
+	&xor	($lo,$hi);
+	&pop	("esi");
+	&mov	(&DWP(8,"ebp"),$hi);
+	&pop	("ebx");
+	&mov	(&DWP(4,"ebp"),$lo);
+	&pop	("ebp");
+	&ret	();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
--- a/openssl-1.0.2f/crypto/bn/asm/x86-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,593 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+# Locate the perlasm support library relative to this script and load it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+# $sse2 is set when -DOPENSSL_IA32_SSE2 appears among the arguments; it
+# enables generation of the SSE2 code path further down.
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+# bn_mul_mont: per the argument comments below, the parameters are
+# (rp, ap, bp, np, n0, num). The function returns 0 in eax without doing
+# any work when num<4, and 1 after completing the multiplication.
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");		# return value 0 if we bail out
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	# NOTE(review): lea yields the address of the ap argument slot on the
+	# stack, not the ap pointer value itself - confirm this is intended.
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arranging a 2K window between the
+	# stack pointer and the ap argument [np is also a position-sensitive
+	# vector, but it's assumed to be near ap, as it's allocated at
+	# about the same time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	# edi still holds num+2 here, hence the -3
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengths on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&add	("edx","edx");
+	&shr	($carry,31);
+	&add	("edx",$sbit);
+	&adc	($carry,0);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+# Common tail, reached from every multiplication path with $num holding
+# num-1 and the intermediate result in tp[] at esp+$frame: subtract the
+# modulus from tp into rp, then use the final borrow to select whether
+# rp keeps that difference or is overwritten with tp.
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+	# The borrow lives in CF across iterations: the loop body only uses
+	# instructions that leave CF alone (mov, lea, dec), and the loop
+	# condition is taken from the sign of $j after dec.
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	# eax becomes the selection mask; the source pointer is picked
+	# with and/or instead of a branch
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
+
+	# Runs from index num-1 down to 0. The "zap" store overwrites the
+	# temporary vector with whatever $j holds when the loop above exits.
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);		# return 1: multiplication was performed
+&set_label("just_leave");		# early exit lands here with eax=0
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
--- a/openssl-1.0.2f/crypto/bn/asm/x86.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86.pl
@@ -0,0 +1,28 @@
+#!/usr/local/bin/perl
+
+# Driver script: loads the perlasm framework and the per-routine
+# generators from x86/, then emits every bignum routine in a fixed order.
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+# generator modules, loaded in this order
+foreach my $part ("mul_add","mul","sqr","div","add","sub","comba")
+	{ require("x86/$part.pl"); }
+
+&asm_init($ARGV[0],$0);
+
+# word-array routines: each generator is called with its own name as the
+# name of the function to emit
+foreach my $routine ("bn_mul_add_words","bn_mul_words","bn_sqr_words",
+		     "bn_div_words","bn_add_words","bn_sub_words")
+	{ &$routine($routine); }
+
+# comba multiplication, then comba squaring, each for 8 and for 4 words
+foreach my $generator ("bn_mul_comba","bn_sqr_comba")
+	{
+	foreach my $words (8,4)
+		{ &$generator("$generator$words",$words); }
+	}
+
+&asm_finish();
+
--- a/openssl-1.0.2f/crypto/bn/asm/x86/add.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/add.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# bn_add_words($name): emit a function called $name that adds two word
+# arrays. Its stack arguments are read as (r, a, b, num); r[i]=a[i]+b[i]
+# is computed with carry propagation and the final carry is left in eax.
+# The main loop is unrolled to handle eight words per iteration; up to
+# seven remaining words are handled by an unrolled tail.
+sub bn_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";	# running carry, also the return value
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&add($tmp1,$c);			# add the incoming carry
+		 &mov($c,0);			# mov leaves the flags intact
+		&adc($c,$c);			# c = carry out of the first add
+		 &add($tmp1,$tmp2);
+		&adc($c,0);			# c += carry out of the second add
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);		# words left over after the unrolled loop
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);	# sets ZF for the jz below
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/comba.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/comba.pl
@@ -0,0 +1,277 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# mul_add_c: emit one multiply-accumulate step of the comba multiplier.
+# Multiplies the pre-loaded eax by edx and adds the 64-bit product into
+# the three-word accumulator ($c2:$c1:$c0). Depending on $pos it also
+# preloads the operands for the next step (a[$na] into eax, b[$nb] into
+# edx) and/or stores $c0 to r[$i].
+sub mul_add_c
+	{
+	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == 0: more products follow for this result word; only preload
+	#           the next a and b words
+	# pos == 1: last product for this result word; store c0 to r[$i],
+	#           then preload the next a and b words
+	# pos  > 1: last product overall; store c0 to r[$i], no preload
+
+	&comment("mul a[$ai]*b[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	&mul("edx");
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
+	}
+
+# sqr_add_c: emit one multiply-accumulate step of the comba squaring
+# code in which the product is added once. Multiplies eax by itself when
+# $ai == $bi, otherwise by edx, and adds the product into the accumulator
+# ($c2:$c1:$c0). Depending on $pos it also preloads a[$na] into eax and
+# a[$nb] into edx for the next step and/or stores $c0 to r[$i].
+sub sqr_add_c
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == 0: only preload the next a word into eax
+	# pos == 1: store c0 to r[$i], then preload eax (and edx when
+	#           $na != $nb) for the next result word
+	# pos  > 1: store c0 to r[$i], no preload
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
+	}
+
+# sqr_add_c2: emit one multiply-accumulate step of the comba squaring
+# code in which the product is added twice. The 64-bit product in
+# edx:eax is doubled in place (the bit shifted out goes into $c2) and
+# then added into the accumulator ($c2:$c1:$c0). Depending on $pos it
+# also preloads a[$na] into eax and a[$nb] into edx for the next step
+# and/or stores $c0 to r[$i].
+sub sqr_add_c2
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == 0: only preload the next operands
+	# pos == 1: store c0 to r[$i] and preload the next operands
+	# pos  > 1: store c0 to r[$i], no preload
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$a,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add("eax","eax");	# double the low word
+	 ###
+	&adc("edx","edx");	# double the high word, pulling in the carry
+	 ###
+	&adc($c2,0);		# bit shifted out of the doubled product
+	 &add($c0,"eax");
+	&adc($c1,"edx");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
+	&adc($c2,0);
+	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+	 ###
+	}
+
+# bn_mul_comba($name,$num): emit a fully unrolled comba multiplication
+# function called $name for $num-word operands. Its stack arguments are
+# read as (r, a, b). The 2*$num result words are produced one column at a
+# time: result word i accumulates every product a[ai]*b[bi] with
+# ai+bi == i in a three-register accumulator, which is rotated after each
+# column has been stored.
+#
+# The loop scratch variables $j, $v, $na and $nb are localized so that
+# generating a routine does not clobber the caller's globals; the emitted
+# assembly is unaffected.
+sub bn_mul_comba
+	{
+	local($name,$num)=@_;
+	local($a,$b,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($tot,$end);
+	local($j,$v,$na,$nb);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$b="edi";
+
+	# $as/$bs: indices of the first a and b word used in the current
+	# column; $be: index of the last b word used
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;	# number of columns that contain products
+
+	&push("esi");
+	 &mov($a,&wparam(1));
+	&push("edi");
+	 &mov($b,&wparam(2));
+	&push("ebp");
+	 &push("ebx");
+
+	&xor($c0,$c0);
+	 &mov("eax",&DWP(0,$a,"",0));	# load the first word of a
+	&xor($c1,$c1);
+	 &mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("################## Calculate word $i");
+
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			# $v becomes mul_add_c's $pos: 0 inside a column,
+			# 1 on its last product, 2 on the very last product
+			if (($j+1) == $end)
+				{
+				$v=1;
+				$v=2 if (($i+1) == $tot);
+				}
+			else
+				{ $v=0; }
+			# indices of the operands to preload for the next step
+			if (($j+1) != $end)
+				{
+				$na=($ai-1);
+				$nb=($bi+1);
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				# &mov("eax",&wparam(0));
+				# &mov(&DWP($i*4,"eax","",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&comment("save r[$i]");
+	# eax still holds r: the final mul_add_c step ($pos > 1) loaded it
+	# and did not reload eax afterwards
+	# &mov("eax",&wparam(0));
+	&mov(&DWP($i*4,"eax","",0),$c0);
+
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+# bn_sqr_comba($name,$num): emit a fully unrolled comba squaring function
+# called $name for a $num-word operand. Its stack arguments are read as
+# (r, a). The 2*$num result words are produced one column at a time. Each
+# column is walked only until the two indices meet: products with
+# ai != bi go through sqr_add_c2, which adds them twice, and the ai == bi
+# product goes through sqr_add_c.
+#
+# Fixes relative to the previous version (the emitted assembly is
+# unchanged):
+#  - the register-name variables are no longer initialised from @_, which
+#    used to seed $r and $a with the function name and word count;
+#  - the loop scratch variables $j, $v, $na and $nb are localized so they
+#    no longer clobber the caller's globals.
+sub bn_sqr_comba
+	{
+	local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($b,$tot,$end,$half);
+	local($j,$v,$na,$nb);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$r="edi";
+
+	&push("esi");
+	 &push("edi");
+	&push("ebp");
+	 &push("ebx");
+	&mov($r,&wparam(0));
+	 &mov($a,&wparam(1));
+	&xor($c0,$c0);
+	 &xor($c1,$c1);
+	&mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+	# $as/$bs: the two starting indices for the current column;
+	# $be: the largest second index the column could reach
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;	# number of columns that contain products
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("############### Calculate word $i");
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			# the column ends once the next index pair would
+			# cross; $v is passed on as the helpers' $pos
+			if (($ai-1) < ($bi+1))
+				{
+				$v=1;
+				$v=2 if ($i+1) == $tot;
+				}
+			else
+				{ $v=0; }
+			# indices of the operands to preload for the next step
+			if (!$v)
+				{
+				$na=$ai-1;
+				$nb=$bi+1;
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+			if ($ai == $bi)
+				{
+				&sqr_add_c($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			else
+				{
+				&sqr_add_c2($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				#&mov(&DWP($i*4,$r,"",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				last;
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	# top result word: whatever is left in the accumulator
+	&mov(&DWP($i*4,$r,"",0),$c0);
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/div.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/div.pl
@@ -0,0 +1,15 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# Emit routine $name: loads its first argument into edx, its second into
+# eax and its third into ebx, then executes the x86 "div ebx" instruction,
+# which divides edx:eax by ebx and leaves the quotient in eax.
+sub bn_div_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+	&mov("edx",&wparam(0));	# high word of the dividend
+	&mov("eax",&wparam(1));	# low word of the dividend
+	&mov("ebx",&wparam(2));	# divisor
+	&div("ebx");		# eax = quotient, edx = remainder
+	&function_end($name);
+	}
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/f
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/f
@@ -0,0 +1,3 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
--- a/openssl-1.0.2f/crypto/bn/asm/x86/mul.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/mul.pl
@@ -0,0 +1,77 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# Emit routine $name: multiplies every word of an input array by a single
+# word, storing the low half of each product (plus the running carry) to
+# the output array and chaining the high half into the next word.
+# Arguments as read below: wparam(0)=r, wparam(1)=a, wparam(2)=num,
+# wparam(3)=w.  The final carry word is returned in eax.
+sub bn_mul_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ecx";
+	$r="edi";
+	$c="esi";
+	$num="ebp";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	# r
+	&mov($a,&wparam(1));	# a
+	&mov($num,&wparam(2));	# num
+	&mov($w,&wparam(3));	# w
+
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+	&jz(&label("mw_finish"));
+
+	# Main loop: eight words per pass, fully unrolled ($i is a byte offset).
+	&set_label("mw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jz(&label("mw_finish"));
+	&jmp(&label("mw_loop"));
+
+	# Tail: the remaining num mod 8 words (at most seven).
+	&set_label("mw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jnz(&label("mw_finish2"));
+	&jmp(&label("mw_end"));
+
+	&set_label("mw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		# No count/branch after the seventh tail word: execution
+		# simply falls through to mw_end.
+		 &dec($num) if ($i != 7-1);
+		&jz(&label("mw_end")) if ($i != 7-1);
+		}
+	&set_label("mw_end",0);
+	&mov("eax",$c);		# return the final carry
+
+	&function_end($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/mul_add.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/mul_add.pl
@@ -0,0 +1,87 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# Emit routine $name: for every word, computes *a * w, adds the running
+# carry and the current value of *r, stores the low half back to *r and
+# chains the high half into the next word.  Arguments as read below:
+# wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.  The final carry
+# word is returned in eax.
+sub bn_mul_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	# r
+
+	&mov("ecx",&wparam(2));	# num
+	&mov($a,&wparam(1));	# a
+
+	&and("ecx",0xfffffff8);	# round num down to a multiple of 8
+	&mov($w,&wparam(3));	# w
+
+	&push("ecx");		# Up the stack for a tmp variable
+
+	# mov and push leave the flags alone, so this still tests the result
+	# of the "and" above.
+	&jz(&label("maw_finish"));
+
+	# Main loop: eight words per pass, fully unrolled ($i is a byte offset).
+	&set_label("maw_loop",0);
+
+	&mov(&swtmp(0),"ecx");	# park the remaining count in the stack temp
+
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);		# L(t)+=c
+		 &mov($c,&DWP($i,$r,"",0));	# c= *r (c is reused as scratch)
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",$c);		# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&mov("ecx",&swtmp(0));	# reload the remaining count
+	&add($a,32);
+	&add($r,32);
+	&sub("ecx",8);
+	&jnz(&label("maw_loop"));
+
+	# Tail: the remaining num mod 8 words (at most seven).
+	&set_label("maw_finish",0);
+	&mov("ecx",&wparam(2));	# get num
+	&and("ecx",7);
+	&jnz(&label("maw_finish2"));	# helps branch prediction
+	&jmp(&label("maw_end"));
+
+	&set_label("maw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 &mov($c,&DWP($i*4,$r,"",0));	# c= *r (c is reused as scratch)
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",$c);		# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		# The two mov instructions between dec and jz do not touch
+		# the flags, so jz still sees the result of the dec.
+		 &dec("ecx") if ($i != 7-1);
+		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
+		 &mov($c,"edx");			# c=  H(t);
+		&jz(&label("maw_end")) if ($i != 7-1);
+		}
+	&set_label("maw_end",0);
+	&mov("eax",$c);		# return the final carry
+
+	&pop("ecx");	# discard the stack temp pushed above
+
+	&function_end($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/sqr.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/sqr.pl
@@ -0,0 +1,60 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# Emit routine $name: squares every input word on its own, writing the
+# two-word result of a[i]*a[i] to r[2*i] (low) and r[2*i+1] (high).
+# Arguments as read below: wparam(0)=r, wparam(1)=a, wparam(2)=num.
+sub bn_sqr_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$r="esi";
+	$a="edi";
+	$num="ebx";
+
+	&mov($r,&wparam(0));	# r
+	&mov($a,&wparam(1));	# a
+	&mov($num,&wparam(2));	# num
+
+	&and($num,0xfffffff8);	# round num down to a multiple of 8
+	&jz(&label("sw_finish"));
+
+	# Main loop: eight words per pass; $i is the input byte offset, and
+	# the output offset is twice that because each square is two words.
+	&set_label("sw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+		&mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*2,$r,"",0),"eax");	# low word of the square
+		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,64);
+	&sub($num,8);
+	&jnz(&label("sw_loop"));
+
+	# Tail: the remaining num mod 8 words (at most seven).
+	&set_label("sw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jz(&label("sw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov("eax",&DWP($i*4,$a,"",0));	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*8,$r,"",0),"eax");	# low word of the square
+		# The mov between dec and jz does not touch the flags.
+		 &dec($num) if ($i != 7-1);
+		&mov(&DWP($i*8+4,$r,"",0),"edx");	# high word of the square
+		 &jz(&label("sw_end")) if ($i != 7-1);
+		}
+	&set_label("sw_end",0);
+
+	&function_end($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86/sub.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86/sub.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+# Emit routine $name: word-by-word subtraction r[i] = a[i] - b[i] with the
+# borrow carried from one word to the next in eax.  Arguments as read
+# below: wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num.  Because the
+# borrow lives in eax, it is also what the routine leaves in eax on exit.
+sub bn_sub_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# round num down to a multiple of 8
+
+	&jz(&label("aw_finish"));
+
+	# Main loop: eight words per pass, fully unrolled.
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);				# *a - incoming borrow
+		 &mov($c,0);				# mov keeps the flags intact
+		&adc($c,$c);				# c = borrow of the first sub
+		 &sub($tmp1,$tmp2);			# ... - *b
+		&adc($c,0);				# c += borrow of the second sub
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	# Tail: the remaining num mod 8 words (at most seven).
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		# The mov between dec and jz does not touch the flags.
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+1;
--- a/openssl-1.0.2f/crypto/bn/asm/x86_64-gcc.c
+++ b/openssl-1.0.2f/crypto/bn/asm/x86_64-gcc.c
@@ -0,0 +1,638 @@
+#include "../bn_lcl.h"
+#if !(defined(__GNUC__) && __GNUC__>=2)
+# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
+#else
+/*-
+ * x86_64 BIGNUM accelerator version 0.1, December 2002.
+ *
+ * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted according to the OpenSSL license. Warranty of any kind is
+ * disclaimed.
+ *
+ * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
+ *    versions, like 1.0...
+ * A. Well, that's because this code is basically a quick-n-dirty
+ *    proof-of-concept hack. As you can see it's implemented with
+ *    inline assembler, which means that you're bound to GCC and that
+ *    there might be enough room for further improvement.
+ *
+ * Q. Why inline assembler?
+ * A. x86_64 features own ABI which I'm not familiar with. This is
+ *    why I decided to let the compiler take care of subroutine
+ *    prologue/epilogue as well as register allocation. For reference.
+ *    Win64 implements different ABI for AMD64, different from Linux.
+ *
+ * Q. How much faster does it get?
+ * A. 'apps/openssl speed rsa dsa' output with no-asm:
+ *
+ *                        sign    verify    sign/s verify/s
+ *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
+ *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
+ *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
+ *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
+ *                        sign    verify    sign/s verify/s
+ *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
+ *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
+ *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
+ *
+ *    'apps/openssl speed rsa dsa' output with this module:
+ *
+ *                        sign    verify    sign/s verify/s
+ *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
+ *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
+ *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
+ *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
+ *                        sign    verify    sign/s verify/s
+ *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
+ *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
+ *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
+ *
+ *    For the reference. IA-32 assembler implementation performs
+ *    very much like 64-bit code compiled with no-asm on the same
+ *    machine.
+ */
+
+# if defined(_WIN64) || !defined(__LP64__)
+#  define BN_ULONG unsigned long long
+# else
+#  define BN_ULONG unsigned long
+# endif
+
+# undef mul
+# undef mul_add
+
+/*-
+ * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
+ * "g"(0)               lets the compiler decide where it wants to
+ *                      keep the value of zero;
+ */
+/*
+ * mul_add(r,a,word,carry): forms the double-word product a*word, adds the
+ * incoming carry and the current value of r to it, writes the low word back
+ * to r and leaves the high word in carry.  r is used as a read/write memory
+ * operand and a as a memory operand, so both must be lvalues.
+ */
+# define mul_add(r,a,word,carry) do {   \
+        register BN_ULONG high,low;     \
+        asm ("mulq %3"                  \
+                : "=a"(low),"=d"(high)  \
+                : "a"(word),"m"(a)      \
+                : "cc");                \
+        asm ("addq %2,%0; adcq %3,%1"   \
+                : "+r"(carry),"+d"(high)\
+                : "a"(low),"g"(0)       \
+                : "cc");                \
+        asm ("addq %2,%0; adcq %3,%1"   \
+                : "+m"(r),"+d"(high)    \
+                : "r"(carry),"g"(0)     \
+                : "cc");                \
+        carry=high;                     \
+        } while (0)
+
+/*
+ * mul(r,a,word,carry): forms the double-word product a*word, adds the
+ * incoming carry, assigns the low word to r and leaves the high word in
+ * carry.
+ */
+# define mul(r,a,word,carry) do {       \
+        register BN_ULONG high,low;     \
+        asm ("mulq %3"                  \
+                : "=a"(low),"=d"(high)  \
+                : "a"(word),"g"(a)      \
+                : "cc");                \
+        asm ("addq %2,%0; adcq %3,%1"   \
+                : "+r"(carry),"+d"(high)\
+                : "a"(low),"g"(0)       \
+                : "cc");                \
+        (r)=carry, carry=high;          \
+        } while (0)
+/*
+ * sqr(r0,r1,a): r0 receives the low word and r1 the high word of a*a.
+ * Note that the expansion already ends in a semicolon.
+ */
+# undef sqr
+# define sqr(r0,r1,a)                   \
+        asm ("mulq %2"                  \
+                : "=a"(r0),"=d"(r1)     \
+                : "a"(a)                \
+                : "cc");
+
+/*
+ * rp[i] += ap[i] * w for i in [0, num), with the high half of every
+ * product carried into the next word.  Returns the carry out of the last
+ * word, or 0 when num <= 0.
+ */
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w)
+{
+    BN_ULONG carry = 0;
+
+    if (num <= 0)
+        return carry;
+
+    /* Bulk of the work: four words per pass while at least four remain. */
+    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
+        mul_add(rp[0], ap[0], w, carry);
+        mul_add(rp[1], ap[1], w, carry);
+        mul_add(rp[2], ap[2], w, carry);
+        mul_add(rp[3], ap[3], w, carry);
+    }
+
+    /* Up to three leftover words, strictly in order (the carry chains). */
+    if (num >= 1)
+        mul_add(rp[0], ap[0], w, carry);
+    if (num >= 2)
+        mul_add(rp[1], ap[1], w, carry);
+    if (num >= 3)
+        mul_add(rp[2], ap[2], w, carry);
+
+    return carry;
+}
+
+/*
+ * rp[i] = low word of (ap[i] * w + carry) for i in [0, num), with the high
+ * word carried into the next position.  Returns the carry out of the last
+ * word, or 0 when num <= 0.
+ */
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
+{
+    BN_ULONG carry = 0;
+
+    if (num <= 0)
+        return carry;
+
+    /* Bulk of the work: four words per pass while at least four remain. */
+    for (; num >= 4; num -= 4, ap += 4, rp += 4) {
+        mul(rp[0], ap[0], w, carry);
+        mul(rp[1], ap[1], w, carry);
+        mul(rp[2], ap[2], w, carry);
+        mul(rp[3], ap[3], w, carry);
+    }
+
+    /* Up to three leftover words, strictly in order (the carry chains). */
+    if (num >= 1)
+        mul(rp[0], ap[0], w, carry);
+    if (num >= 2)
+        mul(rp[1], ap[1], w, carry);
+    if (num >= 3)
+        mul(rp[2], ap[2], w, carry);
+
+    return carry;
+}
+
+/*
+ * Squares each input word independently: for i in [0, n) the low word of
+ * a[i]*a[i] goes to r[2*i] and the high word to r[2*i+1].  Does nothing
+ * when n <= 0.
+ */
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
+{
+    if (n <= 0)
+        return;
+
+    /* Four input words (eight output words) per pass. */
+    for (; n >= 4; n -= 4, a += 4, r += 8) {
+        sqr(r[0], r[1], a[0]);
+        sqr(r[2], r[3], a[1]);
+        sqr(r[4], r[5], a[2]);
+        sqr(r[6], r[7], a[3]);
+    }
+
+    /* Up to three leftover input words. */
+    if (n >= 1) {
+        sqr(r[0], r[1], a[0]);
+    }
+    if (n >= 2) {
+        sqr(r[2], r[3], a[1]);
+    }
+    if (n >= 3) {
+        sqr(r[4], r[5], a[2]);
+    }
+}
+
+/*
+ * Divides the double-word value h:l by d with a single "divq" and returns
+ * the quotient; the remainder is discarded.
+ *
+ * "divq" raises a divide error when d is zero or when the quotient does
+ * not fit in one word, so callers must pass d != 0 and h < d.
+ *
+ * The divisor uses the "r" constraint: "divq" only accepts a register or
+ * memory operand, whereas the previous "g" constraint would also have
+ * allowed the compiler to substitute an immediate.
+ */
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
+{
+    BN_ULONG ret, waste;
+
+    asm ("divq      %4"
+         : "=a"(ret), "=d"(waste)
+         : "a"(l), "d"(h), "r"(d)
+         : "cc");
+
+    return ret;
+}
+
+/*
+ * rp[i] = ap[i] + bp[i] for i in [0, n), propagating the carry from word
+ * to word.  Returns the carry out of the top word (0 or 1); returns 0
+ * without touching rp when n <= 0.
+ */
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int n)
+{
+    BN_ULONG ret;
+    size_t i = 0;
+
+    if (n <= 0)
+        return 0;
+
+    /*
+     * The counter is a 32-bit int, so it is stepped with "dec %1"/"jnz"
+     * instead of "loop": "loop" counts in the full 64-bit %rcx, whose
+     * upper half is not defined for an int operand.  Neither "lea" nor
+     * "dec" modifies CF, so the carry chain survives between iterations.
+     */
+    asm volatile ("       subq    %0,%0           \n" /* clear carry */
+                  "       jmp     1f              \n"
+                  ".p2align 4                     \n"
+                  "1:     movq    (%4,%2,8),%0    \n"
+                  "       adcq    (%5,%2,8),%0    \n"
+                  "       movq    %0,(%3,%2,8)    \n"
+                  "       lea     1(%2),%2        \n"
+                  "       dec     %1              \n"
+                  "       jnz     1b              \n"
+                  "       sbbq    %0,%0           \n" /* ret = -carry */
+                  :"=&r" (ret), "+c"(n), "+r"(i)
+                  :"r"(rp), "r"(ap), "r"(bp)
+                  :"cc", "memory");
+
+    return ret & 1;
+}
+
+# ifndef SIMICS
+/*
+ * rp[i] = ap[i] - bp[i] for i in [0, n), propagating the borrow from word
+ * to word.  Returns the borrow out of the top word (0 or 1); returns 0
+ * without touching rp when n <= 0.
+ */
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int n)
+{
+    BN_ULONG ret;
+    size_t i = 0;
+
+    if (n <= 0)
+        return 0;
+
+    /*
+     * The counter is a 32-bit int, so it is stepped with "dec %1"/"jnz"
+     * instead of "loop": "loop" counts in the full 64-bit %rcx, whose
+     * upper half is not defined for an int operand.  Neither "lea" nor
+     * "dec" modifies CF, so the borrow chain survives between iterations.
+     */
+    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
+                  "       jmp     1f              \n"
+                  ".p2align 4                     \n"
+                  "1:     movq    (%4,%2,8),%0    \n"
+                  "       sbbq    (%5,%2,8),%0    \n"
+                  "       movq    %0,(%3,%2,8)    \n"
+                  "       lea     1(%2),%2        \n"
+                  "       dec     %1              \n"
+                  "       jnz     1b              \n"
+                  "       sbbq    %0,%0           \n" /* ret = -borrow */
+                  :"=&r" (ret), "+c"(n), "+r"(i)
+                  :"r"(rp), "r"(ap), "r"(bp)
+                  :"cc", "memory");
+
+    return ret & 1;
+}
+# else
+/* Simics 1.4<7 has buggy sbbq:-( */
+#  define BN_MASK2 0xffffffffffffffffL
+/*
+ * Portable C fallback for the SIMICS build: r[i] = a[i] - b[i] for i in
+ * [0, n) with borrow propagation.  Returns the final borrow (0 or 1), or 0
+ * when n <= 0.  The input pointers are const-qualified so the signature
+ * matches the inline-asm definition of the same function above.
+ */
+BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      int n)
+{
+    BN_ULONG t1, t2;
+    int c = 0;
+
+    if (n <= 0)
+        return ((BN_ULONG)0);
+
+    /*
+     * Unrolled four words per pass.  For each word the borrow is only
+     * recomputed when the operands differ: when they are equal the
+     * incoming borrow simply passes through to the next word.
+     */
+    for (;;) {
+        t1 = a[0];
+        t2 = b[0];
+        r[0] = (t1 - t2 - c) & BN_MASK2;
+        if (t1 != t2)
+            c = (t1 < t2);
+        if (--n <= 0)
+            break;
+
+        t1 = a[1];
+        t2 = b[1];
+        r[1] = (t1 - t2 - c) & BN_MASK2;
+        if (t1 != t2)
+            c = (t1 < t2);
+        if (--n <= 0)
+            break;
+
+        t1 = a[2];
+        t2 = b[2];
+        r[2] = (t1 - t2 - c) & BN_MASK2;
+        if (t1 != t2)
+            c = (t1 < t2);
+        if (--n <= 0)
+            break;
+
+        t1 = a[3];
+        t2 = b[3];
+        r[3] = (t1 - t2 - c) & BN_MASK2;
+        if (t1 != t2)
+            c = (t1 < t2);
+        if (--n <= 0)
+            break;
+
+        a += 4;
+        b += 4;
+        r += 4;
+    }
+    return (c);
+}
+# endif
+
+/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
+/*
+ * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
+ * c=(c2,c1,c0)
+ */
+
+/*
+ * Keep in mind that carrying into high part of multiplication result
+ * can not overflow, because it cannot be all-ones.
+ */
+# if 0
+/* original macros are kept for reference purposes */
+/* (this branch is never compiled; it uses BN_UMULT_LOHI instead of asm) */
+#  define mul_add_c(a,b,c0,c1,c2)       do {    \
+        BN_ULONG ta = (a), tb = (b);            \
+        BN_ULONG lo, hi;                        \
+        BN_UMULT_LOHI(lo,hi,ta,tb);             \
+        c0 += lo; hi += (c0<lo)?1:0;            \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
+        } while(0)
+
+#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
+        BN_ULONG ta = (a), tb = (b);            \
+        BN_ULONG lo, hi, tt;                    \
+        BN_UMULT_LOHI(lo,hi,ta,tb);             \
+        c0 += lo; tt = hi+((c0<lo)?1:0);        \
+        c1 += tt; c2 += (c1<tt)?1:0;            \
+        c0 += lo; hi += (c0<lo)?1:0;            \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
+        } while(0)
+
+#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
+        BN_ULONG ta = (a)[i];                   \
+        BN_ULONG lo, hi;                        \
+        BN_UMULT_LOHI(lo,hi,ta,ta);             \
+        c0 += lo; hi += (c0<lo)?1:0;            \
+        c1 += hi; c2 += (c1<hi)?1:0;            \
+        } while(0)
+# else
+/*
+ * Active implementations.  Each macro forms the two-word product t2:t1
+ * with "mulq" and then adds it into the three-word accumulator with one
+ * addq/adcq/adcq chain: c0 += t1, c1 += t2 + carry, c2 += carry.
+ */
+
+/* c += a*b; b is used as a memory operand and so must be an lvalue */
+#  define mul_add_c(a,b,c0,c1,c2) do {  \
+        BN_ULONG t1,t2;                 \
+        asm ("mulq %3"                  \
+                : "=a"(t1),"=d"(t2)     \
+                : "a"(a),"m"(b)         \
+                : "cc");                \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
+                : "r"(t1),"r"(t2),"g"(0)                \
+                : "cc");                                \
+        } while (0)
+
+/* c += a[i]*a[i]; the single "a" operand is multiplied by itself */
+#  define sqr_add_c(a,i,c0,c1,c2) do {  \
+        BN_ULONG t1,t2;                 \
+        asm ("mulq %2"                  \
+                : "=a"(t1),"=d"(t2)     \
+                : "a"(a[i])             \
+                : "cc");                \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
+                : "r"(t1),"r"(t2),"g"(0)                \
+                : "cc");                                \
+        } while (0)
+
+/* c += 2*a*b: the same product is added into the accumulator twice */
+#  define mul_add_c2(a,b,c0,c1,c2) do { \
+        BN_ULONG t1,t2;                 \
+        asm ("mulq %3"                  \
+                : "=a"(t1),"=d"(t2)     \
+                : "a"(a),"m"(b)         \
+                : "cc");                \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
+                : "r"(t1),"r"(t2),"g"(0)                \
+                : "cc");                                \
+        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
+                : "+r"(c0),"+r"(c1),"+r"(c2)            \
+                : "r"(t1),"r"(t2),"g"(0)                \
+                : "cc");                                \
+        } while (0)
+# endif
+
+/* c += 2*a[i]*a[j], expressed through mul_add_c2 on the two array words */
+# define sqr_add_c2(a,i,j,c0,c1,c2)      \
+        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
+/*
+ * r[0..15] = a[0..7] * b[0..7], computed column by column: result word
+ * r[k] collects every product a[i]*b[j] with i + j == k.
+ *
+ * c1, c2 and c3 form a rotating three-word accumulator.  After a column is
+ * finished its lowest word is stored to r[k], that variable is zeroed and
+ * becomes the top word of the accumulator for the next column, which is
+ * why the argument order of mul_add_c changes from column to column.
+ */
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+{
+    BN_ULONG c1, c2, c3;
+
+    c1 = 0;
+    c2 = 0;
+    c3 = 0;
+    mul_add_c(a[0], b[0], c1, c2, c3);
+    r[0] = c1;
+    c1 = 0;
+    mul_add_c(a[0], b[1], c2, c3, c1);
+    mul_add_c(a[1], b[0], c2, c3, c1);
+    r[1] = c2;
+    c2 = 0;
+    mul_add_c(a[2], b[0], c3, c1, c2);
+    mul_add_c(a[1], b[1], c3, c1, c2);
+    mul_add_c(a[0], b[2], c3, c1, c2);
+    r[2] = c3;
+    c3 = 0;
+    mul_add_c(a[0], b[3], c1, c2, c3);
+    mul_add_c(a[1], b[2], c1, c2, c3);
+    mul_add_c(a[2], b[1], c1, c2, c3);
+    mul_add_c(a[3], b[0], c1, c2, c3);
+    r[3] = c1;
+    c1 = 0;
+    mul_add_c(a[4], b[0], c2, c3, c1);
+    mul_add_c(a[3], b[1], c2, c3, c1);
+    mul_add_c(a[2], b[2], c2, c3, c1);
+    mul_add_c(a[1], b[3], c2, c3, c1);
+    mul_add_c(a[0], b[4], c2, c3, c1);
+    r[4] = c2;
+    c2 = 0;
+    mul_add_c(a[0], b[5], c3, c1, c2);
+    mul_add_c(a[1], b[4], c3, c1, c2);
+    mul_add_c(a[2], b[3], c3, c1, c2);
+    mul_add_c(a[3], b[2], c3, c1, c2);
+    mul_add_c(a[4], b[1], c3, c1, c2);
+    mul_add_c(a[5], b[0], c3, c1, c2);
+    r[5] = c3;
+    c3 = 0;
+    mul_add_c(a[6], b[0], c1, c2, c3);
+    mul_add_c(a[5], b[1], c1, c2, c3);
+    mul_add_c(a[4], b[2], c1, c2, c3);
+    mul_add_c(a[3], b[3], c1, c2, c3);
+    mul_add_c(a[2], b[4], c1, c2, c3);
+    mul_add_c(a[1], b[5], c1, c2, c3);
+    mul_add_c(a[0], b[6], c1, c2, c3);
+    r[6] = c1;
+    c1 = 0;
+    mul_add_c(a[0], b[7], c2, c3, c1);
+    mul_add_c(a[1], b[6], c2, c3, c1);
+    mul_add_c(a[2], b[5], c2, c3, c1);
+    mul_add_c(a[3], b[4], c2, c3, c1);
+    mul_add_c(a[4], b[3], c2, c3, c1);
+    mul_add_c(a[5], b[2], c2, c3, c1);
+    mul_add_c(a[6], b[1], c2, c3, c1);
+    mul_add_c(a[7], b[0], c2, c3, c1);
+    r[7] = c2;
+    c2 = 0;
+    mul_add_c(a[7], b[1], c3, c1, c2);
+    mul_add_c(a[6], b[2], c3, c1, c2);
+    mul_add_c(a[5], b[3], c3, c1, c2);
+    mul_add_c(a[4], b[4], c3, c1, c2);
+    mul_add_c(a[3], b[5], c3, c1, c2);
+    mul_add_c(a[2], b[6], c3, c1, c2);
+    mul_add_c(a[1], b[7], c3, c1, c2);
+    r[8] = c3;
+    c3 = 0;
+    mul_add_c(a[2], b[7], c1, c2, c3);
+    mul_add_c(a[3], b[6], c1, c2, c3);
+    mul_add_c(a[4], b[5], c1, c2, c3);
+    mul_add_c(a[5], b[4], c1, c2, c3);
+    mul_add_c(a[6], b[3], c1, c2, c3);
+    mul_add_c(a[7], b[2], c1, c2, c3);
+    r[9] = c1;
+    c1 = 0;
+    mul_add_c(a[7], b[3], c2, c3, c1);
+    mul_add_c(a[6], b[4], c2, c3, c1);
+    mul_add_c(a[5], b[5], c2, c3, c1);
+    mul_add_c(a[4], b[6], c2, c3, c1);
+    mul_add_c(a[3], b[7], c2, c3, c1);
+    r[10] = c2;
+    c2 = 0;
+    mul_add_c(a[4], b[7], c3, c1, c2);
+    mul_add_c(a[5], b[6], c3, c1, c2);
+    mul_add_c(a[6], b[5], c3, c1, c2);
+    mul_add_c(a[7], b[4], c3, c1, c2);
+    r[11] = c3;
+    c3 = 0;
+    mul_add_c(a[7], b[5], c1, c2, c3);
+    mul_add_c(a[6], b[6], c1, c2, c3);
+    mul_add_c(a[5], b[7], c1, c2, c3);
+    r[12] = c1;
+    c1 = 0;
+    mul_add_c(a[6], b[7], c2, c3, c1);
+    mul_add_c(a[7], b[6], c2, c3, c1);
+    r[13] = c2;
+    c2 = 0;
+    mul_add_c(a[7], b[7], c3, c1, c2);
+    /* last column: its low word is r[14], the next accumulator word r[15] */
+    r[14] = c3;
+    r[15] = c1;
+}
+
+/*
+ * r[0..7] = a[0..3] * b[0..3], computed column by column: result word r[k]
+ * collects every product a[i]*b[j] with i + j == k.  c1, c2 and c3 form a
+ * rotating three-word accumulator; after each column its lowest word is
+ * stored and that variable is zeroed to become the new top word.
+ */
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+{
+    BN_ULONG c1, c2, c3;
+
+    c1 = 0;
+    c2 = 0;
+    c3 = 0;
+    mul_add_c(a[0], b[0], c1, c2, c3);
+    r[0] = c1;
+    c1 = 0;
+    mul_add_c(a[0], b[1], c2, c3, c1);
+    mul_add_c(a[1], b[0], c2, c3, c1);
+    r[1] = c2;
+    c2 = 0;
+    mul_add_c(a[2], b[0], c3, c1, c2);
+    mul_add_c(a[1], b[1], c3, c1, c2);
+    mul_add_c(a[0], b[2], c3, c1, c2);
+    r[2] = c3;
+    c3 = 0;
+    mul_add_c(a[0], b[3], c1, c2, c3);
+    mul_add_c(a[1], b[2], c1, c2, c3);
+    mul_add_c(a[2], b[1], c1, c2, c3);
+    mul_add_c(a[3], b[0], c1, c2, c3);
+    r[3] = c1;
+    c1 = 0;
+    mul_add_c(a[3], b[1], c2, c3, c1);
+    mul_add_c(a[2], b[2], c2, c3, c1);
+    mul_add_c(a[1], b[3], c2, c3, c1);
+    r[4] = c2;
+    c2 = 0;
+    mul_add_c(a[2], b[3], c3, c1, c2);
+    mul_add_c(a[3], b[2], c3, c1, c2);
+    r[5] = c3;
+    c3 = 0;
+    mul_add_c(a[3], b[3], c1, c2, c3);
+    /* last column: its low word is r[6], the next accumulator word r[7] */
+    r[6] = c1;
+    r[7] = c2;
+}
+
+/*
+ * r[0..15] = a[0..7] squared, computed column by column.  Each diagonal
+ * term a[i]*a[i] is added once with sqr_add_c; each off-diagonal pair
+ * a[i]*a[j] (i != j) is added once with sqr_add_c2, which counts the
+ * product twice.  c1, c2 and c3 form a rotating three-word accumulator:
+ * after each column its lowest word is stored and that variable is zeroed
+ * to become the new top word.
+ */
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
+{
+    BN_ULONG c1, c2, c3;
+
+    c1 = 0;
+    c2 = 0;
+    c3 = 0;
+    sqr_add_c(a, 0, c1, c2, c3);
+    r[0] = c1;
+    c1 = 0;
+    sqr_add_c2(a, 1, 0, c2, c3, c1);
+    r[1] = c2;
+    c2 = 0;
+    sqr_add_c(a, 1, c3, c1, c2);
+    sqr_add_c2(a, 2, 0, c3, c1, c2);
+    r[2] = c3;
+    c3 = 0;
+    sqr_add_c2(a, 3, 0, c1, c2, c3);
+    sqr_add_c2(a, 2, 1, c1, c2, c3);
+    r[3] = c1;
+    c1 = 0;
+    sqr_add_c(a, 2, c2, c3, c1);
+    sqr_add_c2(a, 3, 1, c2, c3, c1);
+    sqr_add_c2(a, 4, 0, c2, c3, c1);
+    r[4] = c2;
+    c2 = 0;
+    sqr_add_c2(a, 5, 0, c3, c1, c2);
+    sqr_add_c2(a, 4, 1, c3, c1, c2);
+    sqr_add_c2(a, 3, 2, c3, c1, c2);
+    r[5] = c3;
+    c3 = 0;
+    sqr_add_c(a, 3, c1, c2, c3);
+    sqr_add_c2(a, 4, 2, c1, c2, c3);
+    sqr_add_c2(a, 5, 1, c1, c2, c3);
+    sqr_add_c2(a, 6, 0, c1, c2, c3);
+    r[6] = c1;
+    c1 = 0;
+    sqr_add_c2(a, 7, 0, c2, c3, c1);
+    sqr_add_c2(a, 6, 1, c2, c3, c1);
+    sqr_add_c2(a, 5, 2, c2, c3, c1);
+    sqr_add_c2(a, 4, 3, c2, c3, c1);
+    r[7] = c2;
+    c2 = 0;
+    sqr_add_c(a, 4, c3, c1, c2);
+    sqr_add_c2(a, 5, 3, c3, c1, c2);
+    sqr_add_c2(a, 6, 2, c3, c1, c2);
+    sqr_add_c2(a, 7, 1, c3, c1, c2);
+    r[8] = c3;
+    c3 = 0;
+    sqr_add_c2(a, 7, 2, c1, c2, c3);
+    sqr_add_c2(a, 6, 3, c1, c2, c3);
+    sqr_add_c2(a, 5, 4, c1, c2, c3);
+    r[9] = c1;
+    c1 = 0;
+    sqr_add_c(a, 5, c2, c3, c1);
+    sqr_add_c2(a, 6, 4, c2, c3, c1);
+    sqr_add_c2(a, 7, 3, c2, c3, c1);
+    r[10] = c2;
+    c2 = 0;
+    sqr_add_c2(a, 7, 4, c3, c1, c2);
+    sqr_add_c2(a, 6, 5, c3, c1, c2);
+    r[11] = c3;
+    c3 = 0;
+    sqr_add_c(a, 6, c1, c2, c3);
+    sqr_add_c2(a, 7, 5, c1, c2, c3);
+    r[12] = c1;
+    c1 = 0;
+    sqr_add_c2(a, 7, 6, c2, c3, c1);
+    r[13] = c2;
+    c2 = 0;
+    sqr_add_c(a, 7, c3, c1, c2);
+    /* last column: its low word is r[14], the next accumulator word r[15] */
+    r[14] = c3;
+    r[15] = c1;
+}
+
+/*
+ * r[0..7] = a[0..3] squared, computed column by column.  Each diagonal
+ * term a[i]*a[i] is added once with sqr_add_c; each off-diagonal pair
+ * a[i]*a[j] (i != j) is added once with sqr_add_c2, which counts the
+ * product twice.  c1, c2 and c3 form a rotating three-word accumulator.
+ */
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
+{
+    BN_ULONG c1, c2, c3;
+
+    c1 = 0;
+    c2 = 0;
+    c3 = 0;
+    sqr_add_c(a, 0, c1, c2, c3);
+    r[0] = c1;
+    c1 = 0;
+    sqr_add_c2(a, 1, 0, c2, c3, c1);
+    r[1] = c2;
+    c2 = 0;
+    sqr_add_c(a, 1, c3, c1, c2);
+    sqr_add_c2(a, 2, 0, c3, c1, c2);
+    r[2] = c3;
+    c3 = 0;
+    sqr_add_c2(a, 3, 0, c1, c2, c3);
+    sqr_add_c2(a, 2, 1, c1, c2, c3);
+    r[3] = c1;
+    c1 = 0;
+    sqr_add_c(a, 2, c2, c3, c1);
+    sqr_add_c2(a, 3, 1, c2, c3, c1);
+    r[4] = c2;
+    c2 = 0;
+    sqr_add_c2(a, 3, 2, c3, c1, c2);
+    r[5] = c3;
+    c3 = 0;
+    sqr_add_c(a, 3, c1, c2, c3);
+    /* last column: its low word is r[6], the next accumulator word r[7] */
+    r[6] = c1;
+    r[7] = c2;
+}
+#endif
--- a/openssl-1.0.2f/crypto/bn/asm/x86_64-gf2m.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,390 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
+# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
+# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+
+# Command line: [flavour] output.  A first argument containing a dot is
+# taken to be the output file name, leaving the flavour undefined.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# The Win64 calling convention is selected by an assembler flavour of
+# nasm/masm/mingw64 or by an output file name ending in .asm.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for the x86_64-xlate.pl translator next to this script first, then
+# in ../../perlasm relative to it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator.  Stop
+# immediately if the pipe cannot be set up rather than printing into a
+# handle that was never opened.
+open OUT,"| \"$^X\" $xlate $flavour $output"
+	or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Register roles used by the _mul_1x1 template.  $a shares %rax with $lo.
+# The map yields %r9..%r15; the six names below take %r9..%r14 and %r15 is
+# left unused.
+($lo,$hi)=("%rax","%rdx");	$a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+# Template for the internal helper _mul_1x1.  As the text below shows, it
+# takes its operands in $a (%rax) and $b (%rbp) with the nibble mask in
+# $mask (%r8), builds a 16-entry lookup table in a 128+8-byte stack frame
+# and leaves its two-word result in $lo:$hi (%rax:%rdx).
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@abi-omnipotent
+.align	16
+_mul_1x1:
+	sub	\$128+8,%rsp
+	mov	\$-1,$a1
+	lea	($a,$a),$i0
+	shr	\$3,$a1
+	lea	(,$a,4),$i1
+	and	$a,$a1			# a1=a&0x1fffffffffffffff
+	lea	(,$a,8),$a8
+	sar	\$63,$a			# broadcast 63rd bit
+	lea	($a1,$a1),$a2
+	sar	\$63,$i0		# broadcast 62nd bit
+	lea	(,$a1,4),$a4
+	and	$b,$a
+	sar	\$63,$i1		# boardcast 61st bit
+	mov	$a,$hi			# $a is $lo
+	shl	\$63,$lo
+	and	$b,$i0
+	shr	\$1,$hi
+	mov	$i0,$t1
+	shl	\$62,$i0
+	and	$b,$i1
+	shr	\$2,$t1
+	xor	$i0,$lo
+	mov	$i1,$t0
+	shl	\$61,$i1
+	xor	$t1,$hi
+	shr	\$3,$t0
+	xor	$i1,$lo
+	xor	$t0,$hi
+
+	mov	$a1,$a12
+	movq	\$0,0(%rsp)		# tab[0]=0
+	xor	$a2,$a12		# a1^a2
+	mov	$a1,8(%rsp)		# tab[1]=a1
+	 mov	$a4,$a48
+	mov	$a2,16(%rsp)		# tab[2]=a2
+	 xor	$a8,$a48		# a4^a8
+	mov	$a12,24(%rsp)		# tab[3]=a1^a2
+
+	xor	$a4,$a1
+	mov	$a4,32(%rsp)		# tab[4]=a4
+	xor	$a4,$a2
+	mov	$a1,40(%rsp)		# tab[5]=a1^a4
+	xor	$a4,$a12
+	mov	$a2,48(%rsp)		# tab[6]=a2^a4
+	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
+	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
+	 xor	$a48,$a2		# a2^a4^a4^a8=a1^a8
+
+	mov	$a8,64(%rsp)		# tab[8]=a8
+	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
+	mov	$a1,72(%rsp)		# tab[9]=a1^a8
+	 xor	$a4,$a1			# a1^a8^a4
+	mov	$a2,80(%rsp)		# tab[10]=a2^a8
+	 xor	$a4,$a2			# a2^a8^a4
+	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8
+
+	xor	$a4,$a12		# a1^a2^a8^a4
+	mov	$a48,96(%rsp)		# tab[12]=a4^a8
+	 mov	$mask,$i0
+	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
+	 and	$b,$i0
+	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
+	 shr	\$4,$b
+	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
+	 mov	$mask,$i1
+	 and	$b,$i1
+	 shr	\$4,$b
+
+	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
+	mov	$mask,$i0
+	and	$b,$i0
+	shr	\$4,$b
+___
+    # Rounds 1..7.  Each round consumes two more 4-bit groups of $b: one
+    # table entry is folded into $lo:$hi with integer shifts by 8*$n-4 bits,
+    # the other is shifted by $n bytes in an SSE register and XORed into $R.
+    for ($n=1;$n<8;$n++) {
+	$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$mask,$i1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	and	$b,$i1
+	 movq	(%rsp,$i0,8),$Tx
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	 pslldq	\$$n,$Tx
+	 mov	$mask,$i0
+	shr	\$4,$b
+	xor	$t0,$hi
+	 and	$b,$i0
+	 shr	\$4,$b
+	 pxor	$Tx,$R
+___
+    }
+# $n is 8 after the loop, so the backticked expressions below evaluate to
+# shifts by 60 and 4 bits.  The two halves of $R are then XORed into
+# $lo and $hi.
+$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	movq	$R,$i0
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	psrldq	\$8,$R
+	xor	$t0,$hi
+	movq	$R,$i1
+	xor	$i0,$lo
+	xor	$i1,$hi
+
+	add	\$128+8,%rsp
+	ret
+.Lend_mul_1x1:
+.size	_mul_1x1,.-_mul_1x1
+___
+
+# Argument registers of bn_GF2m_mul_2x2, in parameter order
+# (rp, a1, a0, b1, b0).  On Win64 the fifth argument does not arrive in a
+# register: both code paths below fetch it from the stack, and the %r10
+# named here is merely the register the vanilla path loads it into.
+($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
+				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order
+
+# Entry point: tests bit 33 of OPENSSL_ia32cap_P and branches to the
+# table-driven path when it is clear; otherwise the pclmulqdq path that
+# follows is used.
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@abi-omnipotent
+.align	16
+bn_GF2m_mul_2x2:
+	mov	OPENSSL_ia32cap_P(%rip),%rax
+	bt	\$33,%rax
+	jnc	.Lvanilla_mul_2x2
+
+	movq		$a1,%xmm0
+	movq		$b1,%xmm1
+	movq		$a0,%xmm2
+___
+# Fifth argument (b0): read from the caller's stack on Win64, from its
+# register otherwise.
+$code.=<<___ if ($win64);
+	movq		40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+	movq		$b0,%xmm3
+___
+# Three carry-less multiplications combined Karatsuba-style; the 256-bit
+# result is stored to the four words at $rp.
+$code.=<<___;
+	movdqa		%xmm0,%xmm4
+	movdqa		%xmm1,%xmm5
+	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
+	pxor		%xmm2,%xmm4
+	pxor		%xmm3,%xmm5
+	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
+	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
+	xorps		%xmm0,%xmm4
+	xorps		%xmm2,%xmm4	# (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	movdqa		%xmm4,%xmm5
+	pslldq		\$8,%xmm4
+	psrldq		\$8,%xmm5
+	pxor		%xmm4,%xmm2
+	pxor		%xmm5,%xmm0
+	movdqu		%xmm2,0($rp)
+	movdqu		%xmm0,16($rp)
+	ret
+
+.align	16
+.Lvanilla_mul_2x2:
+	lea	-8*17(%rsp),%rsp
+___
+# Win64 only: fetch the fifth argument from above the new frame and save
+# %rdi/%rsi, which the code below overwrites.
+$code.=<<___ if ($win64);
+	mov	`8*17+40`(%rsp),$b0
+	mov	%rdi,8*15(%rsp)
+	mov	%rsi,8*16(%rsp)
+___
+# Table-driven path: save registers, spill the arguments, then call
+# _mul_1x1 three times (a1·b1, a0·b0 and (a0+a1)·(b0+b1)).
+$code.=<<___;
+	mov	%r14,8*10(%rsp)
+	mov	%r13,8*11(%rsp)
+	mov	%r12,8*12(%rsp)
+	mov	%rbp,8*13(%rsp)
+	mov	%rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+	mov	$rp,32(%rsp)		# save the arguments
+	mov	$a1,40(%rsp)
+	mov	$a0,48(%rsp)
+	mov	$b1,56(%rsp)
+	mov	$b0,64(%rsp)
+
+	mov	\$0xf,$mask
+	mov	$a1,$a
+	mov	$b1,$b
+	call	_mul_1x1		# a1·b1
+	mov	$lo,16(%rsp)
+	mov	$hi,24(%rsp)
+
+	mov	48(%rsp),$a
+	mov	64(%rsp),$b
+	call	_mul_1x1		# a0·b0
+	mov	$lo,0(%rsp)
+	mov	$hi,8(%rsp)
+
+	mov	40(%rsp),$a
+	mov	56(%rsp),$b
+	xor	48(%rsp),$a
+	xor	64(%rsp),$b
+	call	_mul_1x1		# (a0+a1)·(b0+b1)
+___
+	# Registers that receive the four words of the two partial products
+	# saved on the stack above, for the final recombination.
+	@r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+	mov	0(%rsp),@r[0]
+	mov	8(%rsp),@r[1]
+	mov	16(%rsp),@r[2]
+	mov	24(%rsp),@r[3]
+	mov	32(%rsp),%rbp
+
+	xor	$hi,$lo
+	xor	@r[1],$hi
+	xor	@r[0],$lo
+	mov	@r[0],0(%rbp)
+	xor	@r[2],$hi
+	mov	@r[3],24(%rbp)
+	xor	@r[3],$lo
+	xor	@r[3],$hi
+	xor	$hi,$lo
+	mov	$hi,16(%rbp)
+	mov	$lo,8(%rbp)
+
+	mov	8*10(%rsp),%r14
+	mov	8*11(%rsp),%r13
+	mov	8*12(%rsp),%r12
+	mov	8*13(%rsp),%rbp
+	mov	8*14(%rsp),%rbx
+___
+# Win64 only: restore the %rdi/%rsi values saved in the prologue.
+$code.=<<___ if ($win64);
+	mov	8*15(%rsp),%rdi
+	mov	8*16(%rsp),%rsi
+___
+$code.=<<___;
+	lea	8*17(%rsp),%rsp
+	ret
+.Lend_mul_2x2:
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# Win64 structured-exception support, emitted only when $win64 is set.
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+# Registers in which the handler receives its four arguments.
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+# The template below contains the handler itself (se_handler) followed by
+# the .pdata/.xdata tables that register _mul_1x1 and the vanilla
+# bn_GF2m_mul_2x2 path.  When the faulting Rip is at or past
+# .Lbody_mul_2x2 the handler reloads the registers saved by that path's
+# prologue into the CONTEXT record before calling RtlVirtualUnwind.
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody_mul_2x2(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lin_prologue
+
+	mov	8*10(%rax),%r14		# mimic epilogue
+	mov	8*11(%rax),%r13
+	mov	8*12(%rax),%r12
+	mov	8*13(%rax),%rbp
+	mov	8*14(%rax),%rbx
+	mov	8*15(%rax),%rdi
+	mov	8*16(%rax),%rsi
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+.Lin_prologue:
+	lea	8*17(%rax),%rax
+	mov	%rax,152($context)	# restore context->Rsp
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	_mul_1x1
+	.rva	.Lend_mul_1x1
+	.rva	.LSEH_info_1x1
+
+	.rva	.Lvanilla_mul_2x2
+	.rva	.Lend_mul_2x2
+	.rva	.LSEH_info_2x2
+.section	.xdata
+.align	8
+.LSEH_info_1x1:
+	.byte	0x01,0x07,0x02,0x00
+	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
+.LSEH_info_2x2:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+# Replace every backticked arithmetic expression in the template with its
+# evaluated value, then send the finished text down the translator pipe.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+# STDOUT is the pipe to the translator here.  Closing a piped handle waits
+# for the child and returns false if it failed, so a broken translation
+# stops the build instead of silently leaving incomplete output behind.
+close STDOUT or die "error closing STDOUT: $!";
--- a/openssl-1.0.2f/crypto/bn/asm/x86_64-mont.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86_64-mont.pl
--- a/openssl-1.0.2f/crypto/bn/asm/x86_64-mont5.pl
+++ b/openssl-1.0.2f/crypto/bn/asm/x86_64-mont5.pl
--- a/openssl-1.0.2f/crypto/bn/bn.h
+++ b/openssl-1.0.2f/crypto/bn/bn.h
@@ -0,0 +1,939 @@
+/* crypto/bn/bn.h */
+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+
+#ifndef HEADER_BN_H
+# define HEADER_BN_H
+
+# include <limits.h>
+# include <openssl/e_os2.h>
+# ifndef OPENSSL_NO_FP_API
+#  include <stdio.h>            /* FILE */
+# endif
+# include <openssl/ossl_typ.h>
+# include <openssl/crypto.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/*
+ * These preprocessor symbols control various aspects of the bignum headers
+ * and library code. They're not defined by any "normal" configuration, as
+ * they are intended for development and testing purposes. NB: defining both
+ * can be useful for debugging application code as well as openssl itself.
+ *   BN_DEBUG      - turn on various debugging alterations to the bignum code
+ *   BN_DEBUG_RAND - uses random poisoning of unused words to trip up
+ *                   mismanagement of bignum internals; also needs BN_DEBUG.
+ */
+/* #define BN_DEBUG */
+/* #define BN_DEBUG_RAND */
+
+# ifndef OPENSSL_SMALL_FOOTPRINT
+#  define BN_MUL_COMBA
+#  define BN_SQR_COMBA
+#  define BN_RECURSION
+# endif
+
+/*
+ * This next option uses the C libraries (2 word)/(1 word) function. If it is
+ * not defined, I use my C version (which is slower). The reason for this
+ * flag is that when the particular C compiler library routine is used, and
+ * the library is linked with a different compiler, the library is missing.
+ * This mostly happens when the library is built with gcc and then linked
+ * using normal cc.  This would be a common occurrence because gcc normally
+ * produces code that is 2 times faster than system compilers for the big
+ * number stuff. For machines with only one compiler (or shared libraries),
+ * this should be on.  Again this is only really a problem on machines that
+ * use "long long's", are 32bit, and are not using my assembler code.
+ */
+# if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \
+    defined(OPENSSL_SYS_WIN32) || defined(linux)
+#  ifndef BN_DIV2W
+#   define BN_DIV2W
+#  endif
+# endif
+
+/*
+ * Assuming long is 64 bits - this is the DEC Alpha. unsigned long long is
+ * only 64 bits :-(, so don't define BN_LLONG for the DEC Alpha.
+ */
+# ifdef SIXTY_FOUR_BIT_LONG
+/* Word type is unsigned long; BN_BITS2 (bits per word) is 64. */
+#  define BN_ULLONG       unsigned long long
+#  define BN_ULONG        unsigned long
+#  define BN_LONG         long
+#  define BN_BITS         128
+#  define BN_BYTES        8
+#  define BN_BITS2        64
+#  define BN_BITS4        32
+/*
+ * NOTE(review): BN_MASK is written as a 128-bit literal, which is wider than
+ * the 64-bit unsigned long long described in the comment above; presumably
+ * it is never expanded in this configuration -- confirm before using it.
+ */
+#  define BN_MASK         (0xffffffffffffffffffffffffffffffffLL)
+#  define BN_MASK2        (0xffffffffffffffffL)
+#  define BN_MASK2l       (0xffffffffL)
+#  define BN_MASK2h       (0xffffffff00000000L)
+#  define BN_MASK2h1      (0xffffffff80000000L)
+#  define BN_TBIT         (0x8000000000000000L)
+#  define BN_DEC_CONV     (10000000000000000000UL)
+/* printf-style formats for one BN_ULONG word; the *_FMT2 forms zero-pad. */
+#  define BN_DEC_FMT1     "%lu"
+#  define BN_DEC_FMT2     "%019lu"
+#  define BN_DEC_NUM      19
+#  define BN_HEX_FMT1     "%lX"
+#  define BN_HEX_FMT2     "%016lX"
+# endif
+
+/*
+ * This is where the long long data type is 64 bits, but long is 32. For
+ * machines where there are 64bit registers, this is the mode to use. IRIX,
+ * on R4000 and above should use this mode, along with the relevant assembler
+ * code :-).  Do NOT define BN_LLONG.
+ */
+# ifdef SIXTY_FOUR_BIT
+#  undef BN_LLONG
+#  undef BN_ULLONG
+#  define BN_ULONG        unsigned long long
+#  define BN_LONG         long long
+#  define BN_BITS         128
+#  define BN_BYTES        8
+#  define BN_BITS2        64
+#  define BN_BITS4        32
+#  define BN_MASK2        (0xffffffffffffffffLL)
+#  define BN_MASK2l       (0xffffffffL)
+#  define BN_MASK2h       (0xffffffff00000000LL)
+#  define BN_MASK2h1      (0xffffffff80000000LL)
+#  define BN_TBIT         (0x8000000000000000LL)
+#  define BN_DEC_CONV     (10000000000000000000ULL)
+#  define BN_DEC_FMT1     "%llu"
+#  define BN_DEC_FMT2     "%019llu"
+#  define BN_DEC_NUM      19
+#  define BN_HEX_FMT1     "%llX"
+#  define BN_HEX_FMT2     "%016llX"
+# endif
+
+# ifdef THIRTY_TWO_BIT
+#  ifdef BN_LLONG
+#   if defined(_WIN32) && !defined(__GNUC__)
+#    define BN_ULLONG     unsigned __int64
+#    define BN_MASK       (0xffffffffffffffffI64)
+#   else
+#    define BN_ULLONG     unsigned long long
+#    define BN_MASK       (0xffffffffffffffffLL)
+#   endif
+#  endif
+#  define BN_ULONG        unsigned int
+#  define BN_LONG         int
+#  define BN_BITS         64
+#  define BN_BYTES        4
+#  define BN_BITS2        32
+#  define BN_BITS4        16
+#  define BN_MASK2        (0xffffffffL)
+#  define BN_MASK2l       (0xffff)
+#  define BN_MASK2h1      (0xffff8000L)
+#  define BN_MASK2h       (0xffff0000L)
+#  define BN_TBIT         (0x80000000L)
+#  define BN_DEC_CONV     (1000000000L)
+#  define BN_DEC_FMT1     "%u"
+#  define BN_DEC_FMT2     "%09u"
+#  define BN_DEC_NUM      9
+#  define BN_HEX_FMT1     "%X"
+#  define BN_HEX_FMT2     "%08X"
+# endif
+
+# define BN_DEFAULT_BITS 1280
+
+# define BN_FLG_MALLOCED         0x01
+# define BN_FLG_STATIC_DATA      0x02
+
+/*
+ * avoid leaking exponent information through timing,
+ * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
+ * BN_div() will call BN_div_no_branch,
+ * BN_mod_inverse() will call BN_mod_inverse_no_branch.
+ */
+# define BN_FLG_CONSTTIME        0x04
+
+/*
+ * Deprecated flag names.  They are only visible while the deprecated API is
+ * enabled, i.e. when OPENSSL_NO_DEPRECATED is *not* defined.  (The guard on
+ * BN_FLG_EXP_CONSTTIME used to be inverted, which hid the deprecated alias
+ * from default builds and exposed it only to no-deprecated builds.)
+ */
+# ifndef OPENSSL_NO_DEPRECATED
+/*
+ * Deprecated name for BN_FLG_CONSTTIME: avoid leaking exponent information
+ * through timings (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime).
+ */
+#  define BN_FLG_EXP_CONSTTIME    BN_FLG_CONSTTIME
+/* used for debugging */
+#  define BN_FLG_FREE             0x8000
+# endif
+# define BN_set_flags(b,n)       ((b)->flags|=(n))
+# define BN_get_flags(b,n)       ((b)->flags&(n))
+
+/*
+ * get a clone of a BIGNUM with changed flags, for *temporary* use only (the
+ * two BIGNUMs cannot be used in parallel!)
+ */
+# define BN_with_flags(dest,b,n)  ((dest)->d=(b)->d, \
+                                  (dest)->top=(b)->top, \
+                                  (dest)->dmax=(b)->dmax, \
+                                  (dest)->neg=(b)->neg, \
+                                  (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
+                                                 |  ((b)->flags & ~BN_FLG_MALLOCED) \
+                                                 |  BN_FLG_STATIC_DATA \
+                                                 |  (n)))
+
+/* Already declared in ossl_typ.h */
+# if 0
+typedef struct bignum_st BIGNUM;
+/* Used for temp variables (declaration hidden in bn_lcl.h) */
+typedef struct bignum_ctx BN_CTX;
+typedef struct bn_blinding_st BN_BLINDING;
+typedef struct bn_mont_ctx_st BN_MONT_CTX;
+typedef struct bn_recp_ctx_st BN_RECP_CTX;
+typedef struct bn_gencb_st BN_GENCB;
+# endif
+
+/*
+ * Representation of a BIGNUM: a sign flag plus an array of BN_ULONG words.
+ * The macros in this header (BN_is_zero, BN_is_odd, BN_set_flags, ...) read
+ * and write these fields directly.
+ */
+struct bignum_st {
+    BN_ULONG *d;                /* Pointer to an array of 'BN_BITS2' bit
+                                 * chunks. */
+    int top;                    /* Index of last used d +1; 0 means the
+                                 * value is zero (see BN_is_zero). */
+    /* The next are internal book keeping for bn_expand. */
+    int dmax;                   /* Size of the d array. */
+    int neg;                    /* one if the number is negative */
+    int flags;                  /* BN_FLG_* bits, see BN_set_flags() and
+                                 * BN_get_flags() */
+};
+
+/* Used for montgomery multiplication */
+struct bn_mont_ctx_st {
+    int ri;                     /* number of bits in R */
+    BIGNUM RR;                  /* used to convert to montgomery form */
+    BIGNUM N;                   /* The modulus */
+    BIGNUM Ni;                  /* R*(1/R mod N) - N*Ni = 1 (Ni is only
+                                 * stored for bignum algorithm) */
+    BN_ULONG n0[2];             /* least significant word(s) of Ni; (type
+                                 * changed with 0.9.9, was "BN_ULONG n0;"
+                                 * before) */
+    int flags;
+};
+
+/*
+ * Used for reciprocal division/mod functions It cannot be shared between
+ * threads
+ */
+struct bn_recp_ctx_st {
+    BIGNUM N;                   /* the divisor */
+    BIGNUM Nr;                  /* the reciprocal */
+    int num_bits;
+    int shift;
+    int flags;
+};
+
+/* Used for slow "generation" functions. */
+struct bn_gencb_st {
+    unsigned int ver;           /* To handle binary (in)compatibility */
+    void *arg;                  /* callback-specific data */
+    union {
+        /* if(ver==1) - handles old style callbacks */
+        void (*cb_1) (int, int, void *);
+        /* if(ver==2) - new callback style */
+        int (*cb_2) (int, int, BN_GENCB *);
+    } cb;
+};
+/* Wrapper function to make using BN_GENCB easier. */
+int BN_GENCB_call(BN_GENCB *cb, int a, int b);
+/*
+ * Macro to populate a BN_GENCB structure with an "old"-style callback: sets
+ * ver to 1, stores cb_arg in arg and the callback in cb.cb_1.  It expands to
+ * a brace block, not an expression, so it can only be used as a statement;
+ * an unbraced "if (...) BN_GENCB_set_old(...); else ..." will not compile.
+ */
+# define BN_GENCB_set_old(gencb, callback, cb_arg) { \
+                BN_GENCB *tmp_gencb = (gencb); \
+                tmp_gencb->ver = 1; \
+                tmp_gencb->arg = (cb_arg); \
+                tmp_gencb->cb.cb_1 = (callback); }
+/*
+ * Macro to populate a BN_GENCB structure with a "new"-style callback: sets
+ * ver to 2, stores cb_arg in arg and the callback in cb.cb_2.  Like
+ * BN_GENCB_set_old it expands to a brace block and must be used as a
+ * statement.
+ */
+# define BN_GENCB_set(gencb, callback, cb_arg) { \
+                BN_GENCB *tmp_gencb = (gencb); \
+                tmp_gencb->ver = 2; \
+                tmp_gencb->arg = (cb_arg); \
+                tmp_gencb->cb.cb_2 = (callback); }
+
+# define BN_prime_checks 0      /* default: select number of iterations based
+                                 * on the size of the number */
+
+/*
+ * number of Miller-Rabin iterations for an error rate of less than 2^-80 for
+ * random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook of
+ * Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
+ * original paper: Damgaard, Landrock, Pomerance: Average case error
+ * estimates for the strong probable prime test. -- Math. Comp. 61 (1993)
+ * 177-194)
+ */
+# define BN_prime_checks_for_size(b) ((b) >= 1300 ?  2 : \
+                                (b) >=  850 ?  3 : \
+                                (b) >=  650 ?  4 : \
+                                (b) >=  550 ?  5 : \
+                                (b) >=  450 ?  6 : \
+                                (b) >=  400 ?  7 : \
+                                (b) >=  350 ?  8 : \
+                                (b) >=  300 ?  9 : \
+                                (b) >=  250 ? 12 : \
+                                (b) >=  200 ? 15 : \
+                                (b) >=  150 ? 18 : \
+                                /* b >= 100 */ 27)
+
+/* Number of bytes needed to hold BN_num_bits(a) bits, rounded up. */
+# define BN_num_bytes(a) ((BN_num_bits(a)+7)/8)
+
+/*
+ * Value tests that read the BIGNUM fields directly.  Most of them expand
+ * their arguments more than once, so the arguments must not have side
+ * effects.
+ * Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8
+ */
+# define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
+                                (((w) == 0) && ((a)->top == 0)))
+# define BN_is_zero(a)       ((a)->top == 0)
+# define BN_is_one(a)        (BN_abs_is_word((a),1) && !(a)->neg)
+# define BN_is_word(a,w)     (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
+# define BN_is_odd(a)        (((a)->top > 0) && ((a)->d[0] & 1))
+
+/* Set a to 1; an expression yielding the int result of BN_set_word(). */
+# define BN_one(a)       (BN_set_word((a),1))
+/*
+ * Set a to zero by resetting top and neg in place.  This is a statement
+ * macro (do/while(0)) and yields no value.
+ */
+# define BN_zero_ex(a) \
+        do { \
+                BIGNUM *_tmp_bn = (a); \
+                _tmp_bn->top = 0; \
+                _tmp_bn->neg = 0; \
+        } while(0)
+/*
+ * BN_zero() is the value-less BN_zero_ex() statement when
+ * OPENSSL_NO_DEPRECATED is defined, and an expression yielding the result of
+ * BN_set_word() otherwise; code that must build in both configurations
+ * should not use its value.
+ */
+# ifdef OPENSSL_NO_DEPRECATED
+#  define BN_zero(a)      BN_zero_ex(a)
+# else
+#  define BN_zero(a)      (BN_set_word((a),0))
+# endif
+
+const BIGNUM *BN_value_one(void);
+char *BN_options(void);
+BN_CTX *BN_CTX_new(void);
+# ifndef OPENSSL_NO_DEPRECATED
+void BN_CTX_init(BN_CTX *c);
+# endif
+void BN_CTX_free(BN_CTX *c);
+void BN_CTX_start(BN_CTX *ctx);
+BIGNUM *BN_CTX_get(BN_CTX *ctx);
+void BN_CTX_end(BN_CTX *ctx);
+int BN_rand(BIGNUM *rnd, int bits, int top, int bottom);
+int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom);
+int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
+int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
+int BN_num_bits(const BIGNUM *a);
+int BN_num_bits_word(BN_ULONG);
+BIGNUM *BN_new(void);
+void BN_init(BIGNUM *);
+void BN_clear_free(BIGNUM *a);
+BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
+void BN_swap(BIGNUM *a, BIGNUM *b);
+BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret);
+int BN_bn2bin(const BIGNUM *a, unsigned char *to);
+BIGNUM *BN_mpi2bn(const unsigned char *s, int len, BIGNUM *ret);
+int BN_bn2mpi(const BIGNUM *a, unsigned char *to);
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
+/** BN_set_negative sets sign of a BIGNUM
+ * \param  b  pointer to the BIGNUM object
+ * \param  n  0 if the BIGNUM b should be positive and a value != 0 otherwise
+ */
+void BN_set_negative(BIGNUM *b, int n);
+/** BN_is_negative returns 1 if the BIGNUM is negative
+ * \param  a  pointer to the BIGNUM object
+ * \return 1 if a < 0 and 0 otherwise
+ */
+# define BN_is_negative(a) ((a)->neg != 0)
+
+int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
+           BN_CTX *ctx);
+# define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx))
+int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m);
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m);
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                  BN_CTX *ctx);
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
+
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
+BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
+int BN_mul_word(BIGNUM *a, BN_ULONG w);
+int BN_add_word(BIGNUM *a, BN_ULONG w);
+int BN_sub_word(BIGNUM *a, BN_ULONG w);
+int BN_set_word(BIGNUM *a, BN_ULONG w);
+BN_ULONG BN_get_word(const BIGNUM *a);
+
+int BN_cmp(const BIGNUM *a, const BIGNUM *b);
+void BN_free(BIGNUM *a);
+int BN_is_bit_set(const BIGNUM *a, int n);
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
+int BN_lshift1(BIGNUM *r, const BIGNUM *a);
+int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+
+int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+               const BIGNUM *m, BN_CTX *ctx);
+int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+                              const BIGNUM *m, BN_CTX *ctx,
+                              BN_MONT_CTX *in_mont);
+int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
+                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
+                     const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+                     BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                      const BIGNUM *m, BN_CTX *ctx);
+
+int BN_mask_bits(BIGNUM *a, int n);
+# ifndef OPENSSL_NO_FP_API
+int BN_print_fp(FILE *fp, const BIGNUM *a);
+# endif
+# ifdef HEADER_BIO_H
+int BN_print(BIO *fp, const BIGNUM *a);
+# else
+int BN_print(void *fp, const BIGNUM *a);
+# endif
+int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx);
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
+int BN_rshift1(BIGNUM *r, const BIGNUM *a);
+void BN_clear(BIGNUM *a);
+BIGNUM *BN_dup(const BIGNUM *a);
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
+int BN_set_bit(BIGNUM *a, int n);
+int BN_clear_bit(BIGNUM *a, int n);
+char *BN_bn2hex(const BIGNUM *a);
+char *BN_bn2dec(const BIGNUM *a);
+int BN_hex2bn(BIGNUM **a, const char *str);
+int BN_dec2bn(BIGNUM **a, const char *str);
+int BN_asc2bn(BIGNUM **a, const char *str);
+int BN_gcd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); /* returns
+                                                                  * -2 for
+                                                                  * error */
+BIGNUM *BN_mod_inverse(BIGNUM *ret,
+                       const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
+BIGNUM *BN_mod_sqrt(BIGNUM *ret,
+                    const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
+
+void BN_consttime_swap(BN_ULONG swap, BIGNUM *a, BIGNUM *b, int nwords);
+
+/* Deprecated versions */
+# ifndef OPENSSL_NO_DEPRECATED
+BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
+                          const BIGNUM *add, const BIGNUM *rem,
+                          void (*callback) (int, int, void *), void *cb_arg);
+int BN_is_prime(const BIGNUM *p, int nchecks,
+                void (*callback) (int, int, void *),
+                BN_CTX *ctx, void *cb_arg);
+int BN_is_prime_fasttest(const BIGNUM *p, int nchecks,
+                         void (*callback) (int, int, void *), BN_CTX *ctx,
+                         void *cb_arg, int do_trial_division);
+# endif                         /* !defined(OPENSSL_NO_DEPRECATED) */
+
+/* Newer versions */
+int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
+                         const BIGNUM *rem, BN_GENCB *cb);
+int BN_is_prime_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx, BN_GENCB *cb);
+int BN_is_prime_fasttest_ex(const BIGNUM *p, int nchecks, BN_CTX *ctx,
+                            int do_trial_division, BN_GENCB *cb);
+
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+                            const BIGNUM *Xp, const BIGNUM *Xp1,
+                            const BIGNUM *Xp2, const BIGNUM *e, BN_CTX *ctx,
+                            BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, BIGNUM *Xp1,
+                              BIGNUM *Xp2, const BIGNUM *Xp, const BIGNUM *e,
+                              BN_CTX *ctx, BN_GENCB *cb);
+
+BN_MONT_CTX *BN_MONT_CTX_new(void);
+void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          BN_MONT_CTX *mont, BN_CTX *ctx);
+# define BN_to_montgomery(r,a,mont,ctx)  BN_mod_mul_montgomery(\
+        (r),(a),&((mont)->RR),(mont),(ctx))
+int BN_from_montgomery(BIGNUM *r, const BIGNUM *a,
+                       BN_MONT_CTX *mont, BN_CTX *ctx);
+void BN_MONT_CTX_free(BN_MONT_CTX *mont);
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx);
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from);
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+                                    const BIGNUM *mod, BN_CTX *ctx);
+
+/* BN_BLINDING flags */
+# define BN_BLINDING_NO_UPDATE   0x00000001
+# define BN_BLINDING_NO_RECREATE 0x00000002
+
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
+void BN_BLINDING_free(BN_BLINDING *b);
+int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx);
+int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
+int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b,
+                          BN_CTX *);
+# ifndef OPENSSL_NO_DEPRECATED
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
+void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
+# endif
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
+void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+                                      const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+                                      int (*bn_mod_exp) (BIGNUM *r,
+                                                         const BIGNUM *a,
+                                                         const BIGNUM *p,
+                                                         const BIGNUM *m,
+                                                         BN_CTX *ctx,
+                                                         BN_MONT_CTX *m_ctx),
+                                      BN_MONT_CTX *m_ctx);
+
+# ifndef OPENSSL_NO_DEPRECATED
+void BN_set_params(int mul, int high, int low, int mont);
+int BN_get_params(int which);   /* which: 0 mul, 1 high, 2 low, 3 mont */
+# endif
+
+void BN_RECP_CTX_init(BN_RECP_CTX *recp);
+BN_RECP_CTX *BN_RECP_CTX_new(void);
+void BN_RECP_CTX_free(BN_RECP_CTX *recp);
+int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *rdiv, BN_CTX *ctx);
+int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+                          BN_RECP_CTX *recp, BN_CTX *ctx);
+int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                    const BIGNUM *m, BN_CTX *ctx);
+int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+                BN_RECP_CTX *recp, BN_CTX *ctx);
+
+# ifndef OPENSSL_NO_EC2M
+
+/*
+ * Functions for arithmetic over binary polynomials represented by BIGNUMs.
+ * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
+ * ignored. Note that input arguments are not const so that their bit arrays
+ * can be expanded to the appropriate size if needed.
+ */
+
+/*
+ * r = a + b
+ */
+int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+#  define BN_GF2m_sub(r, a, b) BN_GF2m_add(r, a, b)
+/*
+ * r=a mod p
+ */
+int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p);
+/* r = (a * b) mod p */
+int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                    const BIGNUM *p, BN_CTX *ctx);
+/* r = (a * a) mod p */
+int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+/* r = (1 / b) mod p */
+int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx);
+/* r = (a / b) mod p */
+int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                    const BIGNUM *p, BN_CTX *ctx);
+/* r = (a ^ b) mod p */
+int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                    const BIGNUM *p, BN_CTX *ctx);
+/* r = sqrt(a) mod p */
+int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                     BN_CTX *ctx);
+/* r^2 + r = a mod p */
+int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                           BN_CTX *ctx);
+#  define BN_GF2m_cmp(a, b) BN_ucmp((a), (b))
+/*-
+ * Some functions allow for representation of the irreducible polynomials
+ * as an unsigned int[], say p.  The irreducible f(t) is then of the form:
+ *     t^p[0] + t^p[1] + ... + t^p[k]
+ * where m = p[0] > p[1] > ... > p[k] = 0.
+ */
+/* r = a mod p */
+int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
+/* r = (a * b) mod p */
+int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                        const int p[], BN_CTX *ctx);
+/* r = (a * a) mod p */
+int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
+                        BN_CTX *ctx);
+/* r = (1 / b) mod p */
+int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
+                        BN_CTX *ctx);
+/* r = (a / b) mod p */
+int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                        const int p[], BN_CTX *ctx);
+/* r = (a ^ b) mod p */
+int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                        const int p[], BN_CTX *ctx);
+/* r = sqrt(a) mod p */
+int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
+                         const int p[], BN_CTX *ctx);
+/* r^2 + r = a mod p */
+int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
+                               const int p[], BN_CTX *ctx);
+int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
+int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
+
+# endif
+
+/*
+ * faster mod functions for the 'NIST primes' 0 <= a < p^2
+ */
+int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+
+const BIGNUM *BN_get0_nist_prime_192(void);
+const BIGNUM *BN_get0_nist_prime_224(void);
+const BIGNUM *BN_get0_nist_prime_256(void);
+const BIGNUM *BN_get0_nist_prime_384(void);
+const BIGNUM *BN_get0_nist_prime_521(void);
+
+/* library internal functions */
+
+/*
+ * bn_expand(a, bits): make sure |a| has room for at least |bits| bits.
+ * Evaluates to |a| when a->dmax is already large enough, to the result of
+ * bn_expand2() otherwise, and to NULL when rounding |bits| up to whole words
+ * would overflow an int (previously that case was undefined behaviour).
+ * Both arguments are expanded more than once and must be side-effect free.
+ */
+# define bn_expand(a,bits) \
+    ( \
+        (bits) > (INT_MAX - BN_BITS2 + 1) ? \
+            NULL \
+        : \
+            ((((bits)+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \
+                (a) \
+            : \
+                bn_expand2((a),((bits)+BN_BITS2-1)/BN_BITS2) \
+    )
+# define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
+BIGNUM *bn_expand2(BIGNUM *a, int words);
+# ifndef OPENSSL_NO_DEPRECATED
+BIGNUM *bn_dup_expand(const BIGNUM *a, int words); /* unused */
+# endif
+
+/*-
+ * Bignum consistency macros
+ * There is one "API" macro, bn_fix_top(), for stripping leading zeroes from
+ * bignum data after direct manipulations on the data. There is also an
+ * "internal" macro, bn_check_top(), for verifying that there are no leading
+ * zeroes. Unfortunately, some auditing is required due to the fact that
+ * bn_fix_top() has become an overabused duct-tape because bignum data is
+ * occasionally passed around in an inconsistent state. So the following
+ * changes have been made to sort this out;
+ * - bn_fix_top()s implementation has been moved to bn_correct_top()
+ * - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and
+ *   bn_check_top() is as before.
+ * - if BN_DEBUG *is* defined;
+ *   - bn_check_top() tries to pollute unused words even if the bignum 'top' is
+ *     consistent. (ed: only if BN_DEBUG_RAND is defined)
+ *   - bn_fix_top() maps to bn_check_top() rather than "fixing" anything.
+ * The idea is to have debug builds flag up inconsistent bignums when they
+ * occur. If that occurs in a bn_fix_top(), we examine the code in question; if
+ * the use of bn_fix_top() was appropriate (ie. it follows directly after code
+ * that manipulates the bignum) it is converted to bn_correct_top(), and if it
+ * was not appropriate, we convert it permanently to bn_check_top() and track
+ * down the cause of the bug. Eventually, no internal code should be using the
+ * bn_fix_top() macro. External applications and libraries should try this with
+ * their own code too, both in terms of building against the openssl headers
+ * with BN_DEBUG defined *and* linking with a version of OpenSSL built with it
+ * defined. This not only improves external code, it provides more test
+ * coverage for openssl's own code.
+ */
+
+# ifdef BN_DEBUG
+
+/* We only need assert() when debugging */
+#  include <assert.h>
+
+#  ifdef BN_DEBUG_RAND
+/* To avoid "make update" cvs wars due to BN_DEBUG, use some tricks */
+#   ifndef RAND_pseudo_bytes
+int RAND_pseudo_bytes(unsigned char *buf, int num);
+#    define BN_DEBUG_TRIX
+#   endif
+/*
+ * bn_pollute(a): when top < dmax, overwrite the unused words
+ * d[top] .. d[dmax-1] with a single pseudo-random byte value, so that code
+ * wrongly relying on the contents of unused words misbehaves visibly.
+ */
+#   define bn_pollute(a) \
+        do { \
+                const BIGNUM *_bnum1 = (a); \
+                if(_bnum1->top < _bnum1->dmax) { \
+                        unsigned char _tmp_char; \
+                        /* We cast away const without the compiler knowing, any \
+                         * *genuinely* constant variables that aren't mutable \
+                         * wouldn't be constructed with top!=dmax. */ \
+                        BN_ULONG *_not_const; \
+                        memcpy(&_not_const, &_bnum1->d, sizeof(BN_ULONG*)); \
+                        /* Debug only - safe to ignore error return */ \
+                        RAND_pseudo_bytes(&_tmp_char, 1); \
+                        memset((unsigned char *)(_not_const + _bnum1->top), _tmp_char, \
+                                (_bnum1->dmax - _bnum1->top) * sizeof(BN_ULONG)); \
+                } \
+        } while(0)
+#   ifdef BN_DEBUG_TRIX
+#    undef RAND_pseudo_bytes
+#   endif
+#  else
+#   define bn_pollute(a)
+#  endif
+/*
+ * bn_check_top(a): assert that 'a' has no leading zero word (top == 0 or
+ * d[top-1] != 0), then pollute its unused words.  A NULL 'a' is accepted
+ * and ignored.
+ */
+#  define bn_check_top(a) \
+        do { \
+                const BIGNUM *_bnum2 = (a); \
+                if (_bnum2 != NULL) { \
+                        assert((_bnum2->top == 0) || \
+                                (_bnum2->d[_bnum2->top - 1] != 0)); \
+                        bn_pollute(_bnum2); \
+                } \
+        } while(0)
+
+/* In debug builds bn_fix_top() only checks; it does not repair anything. */
+#  define bn_fix_top(a)           bn_check_top(a)
+
+/*
+ * bn_check_size(bn, bits) / bn_wcheck_size(bn, words): assert that the
+ * given size lies between bn->top and bn->dmax (inclusive).  'bits' is
+ * rounded up to whole words; it is parenthesized so that compound argument
+ * expressions are grouped as the caller intended.
+ */
+#  define bn_check_size(bn, bits) bn_wcheck_size(bn, (((bits)+BN_BITS2-1)/BN_BITS2))
+#  define bn_wcheck_size(bn, words) \
+        do { \
+                const BIGNUM *_bnum2 = (bn); \
+                assert((words) <= (_bnum2)->dmax && (words) >= (_bnum2)->top); \
+                /* avoid unused variable warning with NDEBUG */ \
+                (void)(_bnum2); \
+        } while(0)
+
+# else                          /* !BN_DEBUG */
+
+/*
+ * Non-debug builds: the checking macros expand to nothing and bn_fix_top()
+ * really strips leading zero words via bn_correct_top().
+ */
+#  define bn_pollute(a)
+#  define bn_check_top(a)
+#  define bn_fix_top(a)           bn_correct_top(a)
+#  define bn_check_size(bn, bits)
+#  define bn_wcheck_size(bn, words)
+
+# endif
+
+/*
+ * bn_correct_top(a): strip leading (most significant) zero words from 'a'
+ * by lowering a->top until d[top-1] is non-zero or top reaches 0, then
+ * invoke bn_pollute(a).
+ *
+ * NOTE: this expands to a bare { ... } block rather than do { } while(0),
+ * and it evaluates 'a' several times.  Take care when using it as the only
+ * statement of an if/else branch, and do not pass an argument with side
+ * effects.
+ */
+# define bn_correct_top(a) \
+        { \
+        BN_ULONG *ftl; \
+        int tmp_top = (a)->top; \
+        if (tmp_top > 0) \
+                { \
+                for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
+                        if (*(ftl--)) break; \
+                (a)->top = tmp_top; \
+                } \
+        bn_pollute(a); \
+        }
+
+/*
+ * Low-level primitives that work directly on arrays of BN_ULONG words
+ * ('num' is the word count).  They are only declared here.
+ *
+ * BN_uadd() in bn_add.c treats the return value of bn_add_words() as the
+ * carry out of the addition of the low words.
+ * NOTE(review): the semantics of the other primitives are inferred from
+ * their names and signatures only -- confirm against bn_asm.c or the
+ * platform assembler implementation.
+ */
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w);
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+
+/*
+ * Primes from RFC 2409
+ *
+ * get_rfc2409_prime_768() in bn_const.c returns the result of BN_bin2bn()
+ * on a built-in byte table, with 'bn' passed through as the destination
+ * argument.
+ * NOTE(review): the other getters below presumably follow the same
+ * pattern -- confirm in bn_const.c.
+ */
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
+
+/* Primes from RFC 3526 */
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
+
+/*
+ * NOTE(review): presumably a random-number generator intended for the bn
+ * test programs (same parameter list shape as a bits/top/bottom generator);
+ * its definition is not visible here -- confirm before relying on it.
+ */
+int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom);
+
+/* BEGIN ERROR CODES */
+/*
+ * The following lines are auto generated by the script mkerr.pl. Any changes
+ * made after this point may be overwritten when the script is next run.
+ */
+void ERR_load_BN_strings(void);
+
+/* Error codes for the BN functions. */
+
+/* Function codes. */
+# define BN_F_BNRAND                                      127
+# define BN_F_BN_BLINDING_CONVERT_EX                      100
+# define BN_F_BN_BLINDING_CREATE_PARAM                    128
+# define BN_F_BN_BLINDING_INVERT_EX                       101
+# define BN_F_BN_BLINDING_NEW                             102
+# define BN_F_BN_BLINDING_UPDATE                          103
+# define BN_F_BN_BN2DEC                                   104
+# define BN_F_BN_BN2HEX                                   105
+# define BN_F_BN_CTX_GET                                  116
+# define BN_F_BN_CTX_NEW                                  106
+# define BN_F_BN_CTX_START                                129
+# define BN_F_BN_DIV                                      107
+# define BN_F_BN_DIV_NO_BRANCH                            138
+# define BN_F_BN_DIV_RECP                                 130
+# define BN_F_BN_EXP                                      123
+# define BN_F_BN_EXPAND2                                  108
+# define BN_F_BN_EXPAND_INTERNAL                          120
+# define BN_F_BN_GF2M_MOD                                 131
+# define BN_F_BN_GF2M_MOD_EXP                             132
+# define BN_F_BN_GF2M_MOD_MUL                             133
+# define BN_F_BN_GF2M_MOD_SOLVE_QUAD                      134
+# define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR                  135
+# define BN_F_BN_GF2M_MOD_SQR                             136
+# define BN_F_BN_GF2M_MOD_SQRT                            137
+# define BN_F_BN_LSHIFT                                   145
+# define BN_F_BN_MOD_EXP2_MONT                            118
+# define BN_F_BN_MOD_EXP_MONT                             109
+# define BN_F_BN_MOD_EXP_MONT_CONSTTIME                   124
+# define BN_F_BN_MOD_EXP_MONT_WORD                        117
+# define BN_F_BN_MOD_EXP_RECP                             125
+# define BN_F_BN_MOD_EXP_SIMPLE                           126
+# define BN_F_BN_MOD_INVERSE                              110
+# define BN_F_BN_MOD_INVERSE_NO_BRANCH                    139
+# define BN_F_BN_MOD_LSHIFT_QUICK                         119
+# define BN_F_BN_MOD_MUL_RECIPROCAL                       111
+# define BN_F_BN_MOD_SQRT                                 121
+# define BN_F_BN_MPI2BN                                   112
+# define BN_F_BN_NEW                                      113
+# define BN_F_BN_RAND                                     114
+# define BN_F_BN_RAND_RANGE                               122
+# define BN_F_BN_RSHIFT                                   146
+# define BN_F_BN_USUB                                     115
+
+/* Reason codes. */
+# define BN_R_ARG2_LT_ARG3                                100
+# define BN_R_BAD_RECIPROCAL                              101
+# define BN_R_BIGNUM_TOO_LONG                             114
+# define BN_R_BITS_TOO_SMALL                              118
+# define BN_R_CALLED_WITH_EVEN_MODULUS                    102
+# define BN_R_DIV_BY_ZERO                                 103
+# define BN_R_ENCODING_ERROR                              104
+# define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA                105
+# define BN_R_INPUT_NOT_REDUCED                           110
+# define BN_R_INVALID_LENGTH                              106
+# define BN_R_INVALID_RANGE                               115
+# define BN_R_INVALID_SHIFT                               119
+# define BN_R_NOT_A_SQUARE                                111
+# define BN_R_NOT_INITIALIZED                             107
+# define BN_R_NO_INVERSE                                  108
+# define BN_R_NO_SOLUTION                                 116
+# define BN_R_P_IS_NOT_PRIME                              112
+# define BN_R_TOO_MANY_ITERATIONS                         113
+# define BN_R_TOO_MANY_TEMPORARY_VARIABLES                109
+
+#ifdef  __cplusplus
+}
+#endif
+#endif
--- a/openssl-1.0.2f/crypto/bn/bn.mul
+++ b/openssl-1.0.2f/crypto/bn/bn.mul
@@ -0,0 +1,19 @@
+We need
+
+* bn_mul_comba8
+* bn_mul_comba4
+* bn_mul_normal
+* bn_mul_recursive
+
+* bn_sqr_comba8
+* bn_sqr_comba4
+bn_sqr_normal -> BN_sqr
+* bn_sqr_recursive
+
+* bn_mul_low_recursive
+* bn_mul_low_normal
+* bn_mul_high
+
+* bn_mul_part_recursive	# symmetric but not power of 2
+
+bn_mul_asymetric_recursive # uneven, but do the chop up.
--- a/openssl-1.0.2f/crypto/bn/bn_add.c
+++ b/openssl-1.0.2f/crypto/bn/bn_add.c
@@ -0,0 +1,313 @@
+/* crypto/bn/bn_add.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Signed addition: r = a + b.  r is allowed to be the same object as a or b.
+ *
+ *   a and b have the same sign  ->  add magnitudes, keep that sign
+ *   a and b differ in sign      ->  subtract the smaller magnitude from the
+ *                                   larger; the result is negative exactly
+ *                                   when the negative operand is the larger
+ *
+ * Returns 1 on success, 0 when the underlying unsigned operation fails.
+ */
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+{
+    /* Sample a's sign up front: r may alias a and BN_uadd() resets r->neg. */
+    const int sign_a = a->neg;
+    const BIGNUM *plus, *minus;
+    int ok;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    if ((sign_a ^ b->neg) == 0) {
+        /* Same sign: add the magnitudes and restore the common sign. */
+        ok = BN_uadd(r, a, b);
+        r->neg = sign_a;
+        bn_check_top(r);
+        return ok;
+    }
+
+    /* Exactly one operand is negative: compute |plus| - |minus|. */
+    plus = sign_a ? b : a;
+    minus = sign_a ? a : b;
+
+    if (BN_ucmp(plus, minus) < 0) {
+        /* The negative operand dominates, so the result is negative. */
+        if (!BN_usub(r, minus, plus))
+            return 0;
+        r->neg = 1;
+    } else {
+        if (!BN_usub(r, plus, minus))
+            return 0;
+        r->neg = 0;
+    }
+    return 1;
+}
+
+/*
+ * Unsigned addition: r = |a| + |b|.  The signs of a and b are not consulted
+ * and r->neg is always cleared.  r may be the same object as a or b (the
+ * final copy is skipped when source and destination coincide).
+ *
+ * Returns 1 on success, 0 if r could not be expanded.
+ */
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+{
+    int max, min, dif;
+    BN_ULONG *ap, *bp, *rp, carry, t1, t2;
+    const BIGNUM *tmp;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    /* Arrange for a to be the operand with at least as many words as b. */
+    if (a->top < b->top) {
+        tmp = a;
+        a = b;
+        b = tmp;
+    }
+    max = a->top;
+    min = b->top;
+    dif = max - min;
+
+    /* One extra word in case the addition carries out of the top word. */
+    if (bn_wexpand(r, max + 1) == NULL)
+        return 0;
+
+    r->top = max;
+
+    ap = a->d;
+    bp = b->d;
+    rp = r->d;
+
+    /* Add the words both operands have; the return value is the carry. */
+    carry = bn_add_words(rp, ap, bp, min);
+    rp += min;
+    ap += min;
+    bp += min;
+
+    if (carry) {
+        /*
+         * Propagate the carry through the remaining words of a.  It dies out
+         * as soon as an incremented word does not wrap around to zero.
+         */
+        while (dif) {
+            dif--;
+            t1 = *(ap++);
+            t2 = (t1 + 1) & BN_MASK2;
+            *(rp++) = t2;
+            if (t2) {
+                carry = 0;
+                break;
+            }
+        }
+        if (carry) {
+            /* carry != 0 => dif == 0 */
+            *rp = 1;
+            r->top++;
+        }
+    }
+    if (dif && rp != ap)
+        while (dif--)
+            /* copy remaining words if ap != rp */
+            *(rp++) = *(ap++);
+    r->neg = 0;
+    bn_check_top(r);
+    return 1;
+}
+
+/*
+ * Unsigned subtraction: r = |a| - |b|; a must be larger than (or equal to)
+ * b in magnitude.  r->neg is cleared and leading zero words are stripped
+ * from the result.
+ *
+ * Returns 1 on success and 0 on failure:
+ *  - a has fewer words than b: BN_R_ARG2_LT_ARG3 is raised;
+ *  - r cannot be expanded;
+ *  - a borrow remains after all words were processed (a < b with equal
+ *    word counts): 0 is returned without raising an error, and words of r
+ *    may already have been written.
+ */
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+{
+    int max, min, dif;
+    register BN_ULONG t1, t2, *ap, *bp, *rp;
+    int i, carry;
+#if defined(IRIX_CC_BUG) && !defined(LINT)
+    int dummy;
+#endif
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    max = a->top;
+    min = b->top;
+    dif = max - min;
+
+    if (dif < 0) {              /* hmm... should not be happening */
+        BNerr(BN_F_BN_USUB, BN_R_ARG2_LT_ARG3);
+        return (0);
+    }
+
+    if (bn_wexpand(r, max) == NULL)
+        return (0);
+
+    ap = a->d;
+    bp = b->d;
+    rp = r->d;
+
+    /*
+     * Subtract the low 'min' words.  The hand-written loop is the active
+     * variant; the bn_sub_words() alternative in the #else branch is
+     * compiled out.  'carry' holds the borrow into the next word.
+     */
+#if 1
+    carry = 0;
+    for (i = min; i != 0; i--) {
+        t1 = *(ap++);
+        t2 = *(bp++);
+        if (carry) {
+            carry = (t1 <= t2);
+            t1 = (t1 - t2 - 1) & BN_MASK2;
+        } else {
+            carry = (t1 < t2);
+            t1 = (t1 - t2) & BN_MASK2;
+        }
+# if defined(IRIX_CC_BUG) && !defined(LINT)
+        /* Dummy store, only compiled when IRIX_CC_BUG is defined. */
+        dummy = t1;
+# endif
+        *(rp++) = t1 & BN_MASK2;
+    }
+#else
+    carry = bn_sub_words(rp, ap, bp, min);
+    ap += min;
+    bp += min;
+    rp += min;
+#endif
+    if (carry) {                /* subtracted */
+        if (!dif)
+            /* error: a < b */
+            return 0;
+        /*
+         * Propagate the borrow through the remaining words of a; it stops at
+         * the first word that was non-zero before the decrement.
+         */
+        while (dif) {
+            dif--;
+            t1 = *(ap++);
+            t2 = (t1 - 1) & BN_MASK2;
+            *(rp++) = t2;
+            if (t1)
+                break;
+        }
+    }
+    /*
+     * Copy the remaining high words of a, unless r and a share storage.
+     * The active variant is a copy loop unrolled four times; the memcpy()
+     * form is compiled out.
+     */
+#if 0
+    memcpy(rp, ap, sizeof(*rp) * (max - i));
+#else
+    if (rp != ap) {
+        for (;;) {
+            if (!dif--)
+                break;
+            rp[0] = ap[0];
+            if (!dif--)
+                break;
+            rp[1] = ap[1];
+            if (!dif--)
+                break;
+            rp[2] = ap[2];
+            if (!dif--)
+                break;
+            rp[3] = ap[3];
+            rp += 4;
+            ap += 4;
+        }
+    }
+#endif
+
+    r->top = max;
+    r->neg = 0;
+    /* The difference may have fewer significant words than a. */
+    bn_correct_top(r);
+    return (1);
+}
+
+/*
+ * Signed subtraction: r = a - b.
+ *
+ *    a -  b   ->   a - b
+ *    a - -b   ->   a + b
+ *   -a -  b   ->   -(a + b)
+ *   -a - -b   ->   b - a
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+{
+    const BIGNUM *lhs = a, *rhs = b;
+    int words;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    if (a->neg) {
+        if (!b->neg) {
+            /* -a - b: add the magnitudes, result is negative. */
+            if (!BN_uadd(r, a, b))
+                return 0;
+            r->neg = 1;
+            return 1;
+        }
+        /* -a - -b: this is b - a on the magnitudes. */
+        lhs = b;
+        rhs = a;
+    } else if (b->neg) {
+        /* a - -b: add the magnitudes, result is non-negative. */
+        if (!BN_uadd(r, a, b))
+            return 0;
+        r->neg = 0;
+        return 1;
+    }
+
+    /* From here on we compute |lhs| - |rhs|. */
+    words = (lhs->top > rhs->top) ? lhs->top : rhs->top;
+    if (bn_wexpand(r, words) == NULL)
+        return 0;
+
+    if (BN_ucmp(lhs, rhs) < 0) {
+        /* Smaller minus larger: swap the operands and negate. */
+        if (!BN_usub(r, rhs, lhs))
+            return 0;
+        r->neg = 1;
+    } else {
+        if (!BN_usub(r, lhs, rhs))
+            return 0;
+        r->neg = 0;
+    }
+    bn_check_top(r);
+    return 1;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_add.o
+++ b/openssl-1.0.2f/crypto/bn/bn_add.o
--- a/openssl-1.0.2f/crypto/bn/bn_asm.c
+++ b/openssl-1.0.2f/crypto/bn/bn_asm.c
--- a/openssl-1.0.2f/crypto/bn/bn_blind.c
+++ b/openssl-1.0.2f/crypto/bn/bn_blind.c
@@ -0,0 +1,385 @@
+/* crypto/bn/bn_blind.c */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Update count at which BN_BLINDING_update() re-creates the blinding
+ * parameters (when an exponent is set and BN_BLINDING_NO_RECREATE is not),
+ * and at which the counter is reset to 0.
+ */
+#define BN_BLINDING_COUNTER     32
+
+/* State of one blinding object. */
+struct bn_blinding_st {
+    /* Factor multiplied into the input by BN_BLINDING_convert_ex(). */
+    BIGNUM *A;
+    /* Factor multiplied in by BN_BLINDING_invert_ex() when no r is given. */
+    BIGNUM *Ai;
+    /* Exponent used by BN_BLINDING_create_param(); owned (BN_dup'ed). */
+    BIGNUM *e;
+    /*
+     * Private copy of the modulus: BN_dup()'ed in BN_BLINDING_new() and
+     * released in BN_BLINDING_free().
+     */
+    BIGNUM *mod;
+#ifndef OPENSSL_NO_DEPRECATED
+    unsigned long thread_id;    /* added in OpenSSL 0.9.6j and 0.9.7b; used
+                                 * only by crypto/rsa/rsa_eay.c, rsa_lib.c */
+#endif
+    /* Thread id recorded by BN_BLINDING_new() via CRYPTO_THREADID_current(). */
+    CRYPTO_THREADID tid;
+    /* -1 marks a fresh, never-used blinding; otherwise the update count. */
+    int counter;
+    /* BN_BLINDING_NO_UPDATE / BN_BLINDING_NO_RECREATE bits. */
+    unsigned long flags;
+    /*
+     * Optional Montgomery context and modular exponentiation callback; used
+     * by BN_BLINDING_create_param() only when both are non-NULL.  m_ctx is
+     * not released by BN_BLINDING_free().
+     */
+    BN_MONT_CTX *m_ctx;
+    int (*bn_mod_exp) (BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                       const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+};
+
+/*
+ * Allocate a blinding object.  A and Ai are optional and are duplicated when
+ * given; mod is always duplicated, and the copy inherits BN_FLG_CONSTTIME
+ * from the original.  The counter starts at -1, which marks a fresh blinding
+ * that needs no update before its first use.
+ *
+ * Returns the new object, or NULL on allocation failure.
+ */
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
+{
+    BN_BLINDING *blinding;
+
+    bn_check_top(mod);
+
+    blinding = (BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING));
+    if (blinding == NULL) {
+        BNerr(BN_F_BN_BLINDING_NEW, ERR_R_MALLOC_FAILURE);
+        return NULL;
+    }
+    memset(blinding, 0, sizeof(BN_BLINDING));
+
+    if (A != NULL && (blinding->A = BN_dup(A)) == NULL)
+        goto fail;
+    if (Ai != NULL && (blinding->Ai = BN_dup(Ai)) == NULL)
+        goto fail;
+
+    /* Keep our own copy of the modulus. */
+    blinding->mod = BN_dup(mod);
+    if (blinding->mod == NULL)
+        goto fail;
+    if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
+        BN_set_flags(blinding->mod, BN_FLG_CONSTTIME);
+
+    /* -1: never used, so the first convert must not update it. */
+    blinding->counter = -1;
+    CRYPTO_THREADID_current(&blinding->tid);
+    return blinding;
+
+ fail:
+    BN_BLINDING_free(blinding);
+    return NULL;
+}
+
+/*
+ * Release a blinding object together with the bignums it owns (A, Ai, e and
+ * mod).  Passing NULL is a no-op.
+ */
+void BN_BLINDING_free(BN_BLINDING *r)
+{
+    if (r != NULL) {
+        BIGNUM *owned[4];
+        int i;
+
+        owned[0] = r->A;
+        owned[1] = r->Ai;
+        owned[2] = r->e;
+        owned[3] = r->mod;
+
+        for (i = 0; i < 4; i++) {
+            if (owned[i] != NULL)
+                BN_free(owned[i]);
+        }
+        OPENSSL_free(r);
+    }
+}
+
+/*
+ * Advance the blinding to its next state.
+ *
+ * The update counter is incremented first.  When it reaches
+ * BN_BLINDING_COUNTER, an exponent is available and BN_BLINDING_NO_RECREATE
+ * is not set, fresh parameters are generated; otherwise, unless
+ * BN_BLINDING_NO_UPDATE is set, A and Ai are each squared modulo mod.
+ * On the way out the counter is wrapped back to 0 once it has reached
+ * BN_BLINDING_COUNTER.
+ *
+ * Returns 1 on success, 0 on failure (including an uninitialised blinding).
+ */
+int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
+{
+    int ok = 0;
+    int recreate;
+
+    if (b->A == NULL || b->Ai == NULL) {
+        BNerr(BN_F_BN_BLINDING_UPDATE, BN_R_NOT_INITIALIZED);
+        goto done;
+    }
+
+    if (b->counter == -1)
+        b->counter = 0;
+    b->counter++;
+
+    recreate = (b->counter == BN_BLINDING_COUNTER)
+        && (b->e != NULL)
+        && !(b->flags & BN_BLINDING_NO_RECREATE);
+
+    if (recreate) {
+        /* re-create blinding parameters */
+        if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
+            goto done;
+    } else if ((b->flags & BN_BLINDING_NO_UPDATE) == 0) {
+        if (!BN_mod_mul(b->A, b->A, b->A, b->mod, ctx))
+            goto done;
+        if (!BN_mod_mul(b->Ai, b->Ai, b->Ai, b->mod, ctx))
+            goto done;
+    }
+
+    ok = 1;
+
+ done:
+    if (b->counter == BN_BLINDING_COUNTER)
+        b->counter = 0;
+    return ok;
+}
+
+/*
+ * Blind n in place.  Shorthand for BN_BLINDING_convert_ex() with r == NULL,
+ * i.e. without handing back a copy of the inverse factor.
+ */
+int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
+{
+    return BN_BLINDING_convert_ex(n, NULL, b, ctx);
+}
+
+/*
+ * Blind n in place: n = n * A mod mod.
+ *
+ * A fresh blinding (counter == -1) is used as is; any other blinding is
+ * advanced with BN_BLINDING_update() first.  When r is non-NULL it receives
+ * a copy of the current Ai, for later use with BN_BLINDING_invert_ex().
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
+{
+    int ok = 1;
+
+    bn_check_top(n);
+
+    if (b->A == NULL || b->Ai == NULL) {
+        BNerr(BN_F_BN_BLINDING_CONVERT_EX, BN_R_NOT_INITIALIZED);
+        return 0;
+    }
+
+    if (b->counter != -1) {
+        if (!BN_BLINDING_update(b, ctx))
+            return 0;
+    } else {
+        /* Fresh blinding, doesn't need updating. */
+        b->counter = 0;
+    }
+
+    if (r != NULL && !BN_copy(r, b->Ai))
+        ok = 0;
+
+    if (!BN_mod_mul(n, n, b->A, b->mod, ctx))
+        ok = 0;
+
+    return ok;
+}
+
+/*
+ * Unblind n in place.  Shorthand for BN_BLINDING_invert_ex() with
+ * r == NULL, so the inverse factor stored in the blinding object is used.
+ */
+int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
+{
+    return BN_BLINDING_invert_ex(n, NULL, b, ctx);
+}
+
+/*
+ * Unblind n in place: n = n * f mod mod, where the factor f is r when r is
+ * non-NULL and the blinding's own Ai otherwise.  With r == NULL and no Ai
+ * set, BN_R_NOT_INITIALIZED is raised.
+ *
+ * Returns the result of BN_mod_mul(), or 0 for an uninitialised blinding.
+ */
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b,
+                          BN_CTX *ctx)
+{
+    const BIGNUM *factor = r;
+    int ok;
+
+    bn_check_top(n);
+
+    if (factor == NULL) {
+        factor = b->Ai;
+        if (factor == NULL) {
+            BNerr(BN_F_BN_BLINDING_INVERT_EX, BN_R_NOT_INITIALIZED);
+            return 0;
+        }
+    }
+
+    ok = BN_mod_mul(n, n, factor, b->mod, ctx);
+
+    bn_check_top(n);
+    return ok;
+}
+
+#ifndef OPENSSL_NO_DEPRECATED
+/*
+ * Accessors for the numeric thread_id field.  They are only compiled when
+ * OPENSSL_NO_DEPRECATED is not defined; BN_BLINDING_thread_id() exposes the
+ * CRYPTO_THREADID that BN_BLINDING_new() fills in.
+ */
+
+/* Return the stored numeric thread id. */
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *b)
+{
+    return b->thread_id;
+}
+
+/* Store n as the numeric thread id. */
+void BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
+{
+    b->thread_id = n;
+}
+#endif
+
+/*
+ * Return a pointer to the CRYPTO_THREADID embedded in b.  The pointer refers
+ * to storage inside the blinding object itself, not to a copy.
+ */
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *b)
+{
+    return &b->tid;
+}
+
+/* Return the current flag word of b. */
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
+{
+    return b->flags;
+}
+
+/*
+ * Replace (not OR into) the flag word of b.  BN_BLINDING_update() tests the
+ * BN_BLINDING_NO_UPDATE and BN_BLINDING_NO_RECREATE bits.
+ */
+void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
+{
+    b->flags = flags;
+}
+
+/*
+ * Create (b == NULL) or refresh (b != NULL) a set of blinding parameters.
+ *
+ *  b           existing blinding to refresh, or NULL to allocate a new one
+ *              with BN_BLINDING_new(NULL, NULL, m)
+ *  e           exponent; when non-NULL it replaces the stored exponent (a
+ *              private copy is taken).  An exponent must be available,
+ *              either from this call or from an earlier one.
+ *  m           modulus; only consulted when b == NULL
+ *  bn_mod_exp  optional exponentiation callback, stored when non-NULL
+ *  m_ctx       optional Montgomery context, stored when non-NULL
+ *
+ * A random A is drawn with BN_rand_range(A, mod) and its inverse Ai is
+ * computed modulo mod.  When the draw is not invertible (BN_R_NO_INVERSE)
+ * the error queue is cleared and another draw is made, for at most 33
+ * attempts in total.  Finally A is replaced by A^e mod mod, using the stored
+ * callback when both callback and m_ctx are set and BN_mod_exp() otherwise.
+ *
+ * Returns the blinding on success and NULL on every failure.  A blinding
+ * allocated by this call is freed on failure; a caller-supplied b is never
+ * freed here and stays owned by the caller, but its A/Ai may have been
+ * partially updated.  (Previously a failure with a caller-supplied b
+ * returned b itself, which made the failure indistinguishable from success
+ * for callers such as BN_BLINDING_update().)
+ */
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+                                      const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+                                      int (*bn_mod_exp) (BIGNUM *r,
+                                                         const BIGNUM *a,
+                                                         const BIGNUM *p,
+                                                         const BIGNUM *m,
+                                                         BN_CTX *ctx,
+                                                         BN_MONT_CTX *m_ctx),
+                                      BN_MONT_CTX *m_ctx)
+{
+    int retry_counter = 32;
+    BN_BLINDING *ret = NULL;
+
+    if (b == NULL)
+        ret = BN_BLINDING_new(NULL, NULL, m);
+    else
+        ret = b;
+
+    if (ret == NULL)
+        goto err;
+
+    if (ret->A == NULL && (ret->A = BN_new()) == NULL)
+        goto err;
+    if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
+        goto err;
+
+    if (e != NULL) {
+        if (ret->e != NULL)
+            BN_free(ret->e);
+        ret->e = BN_dup(e);
+    }
+    /* Either the duplication failed or no exponent was ever supplied. */
+    if (ret->e == NULL)
+        goto err;
+
+    if (bn_mod_exp != NULL)
+        ret->bn_mod_exp = bn_mod_exp;
+    if (m_ctx != NULL)
+        ret->m_ctx = m_ctx;
+
+    do {
+        if (!BN_rand_range(ret->A, ret->mod))
+            goto err;
+        if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL) {
+            /*
+             * this should almost never happen for good RSA keys
+             */
+            unsigned long error = ERR_peek_last_error();
+            if (ERR_GET_REASON(error) == BN_R_NO_INVERSE) {
+                if (retry_counter-- == 0) {
+                    BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
+                          BN_R_TOO_MANY_ITERATIONS);
+                    goto err;
+                }
+                /* Drop the "no inverse" error before trying again. */
+                ERR_clear_error();
+            } else
+                goto err;
+        } else
+            break;
+    } while (1);
+
+    if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL) {
+        if (!ret->bn_mod_exp
+            (ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
+            goto err;
+    } else {
+        if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
+            goto err;
+    }
+
+    return ret;
+ err:
+    /*
+     * Only free what was allocated by this call; a caller-supplied b remains
+     * the caller's responsibility.  Failure is always reported as NULL.
+     */
+    if (b == NULL && ret != NULL)
+        BN_BLINDING_free(ret);
+
+    return NULL;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_blind.o
+++ b/openssl-1.0.2f/crypto/bn/bn_blind.o
--- a/openssl-1.0.2f/crypto/bn/bn_const.c
+++ b/openssl-1.0.2f/crypto/bn/bn_const.c
@@ -0,0 +1,547 @@
+/* crypto/bn/bn_const.c */
+/* Insert boilerplate */
+
+#include "bn.h"
+
+/*-
+ * "First Oakley Default Group" from RFC2409, section 6.1.
+ *
+ * The prime is: 2^768 - 2^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 768-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 12 rows of 8 bytes (96 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC2409_PRIME_768[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x3A, 0x36, 0x20,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC2409_PRIME_768, sizeof(RFC2409_PRIME_768), bn);
+}
+
+/*-
+ * "Second Oakley Default Group" from RFC2409, section 6.2.
+ *
+ * The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 1024-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 16 rows of 8 bytes (128 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC2409_PRIME_1024[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE6, 0x53, 0x81,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC2409_PRIME_1024, sizeof(RFC2409_PRIME_1024), bn);
+}
+
+/*-
+ * "1536-bit MODP Group" from RFC3526, Section 2.
+ *
+ * The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
+ *
+ * RFC3526 specifies a generator of 2.
+ * RFC2412 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 1536-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 24 rows of 8 bytes (192 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_1536[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x23, 0x73, 0x27,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_1536, sizeof(RFC3526_PRIME_1536), bn);
+}
+
+/*-
+ * "2048-bit MODP Group" from RFC3526, Section 3.
+ *
+ * The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 2048-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 32 rows of 8 bytes (256 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_2048[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
+        0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
+        0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
+        0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
+        0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
+        0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
+        0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
+        0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
+        0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAC, 0xAA, 0x68,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_2048, sizeof(RFC3526_PRIME_2048), bn);
+}
+
+/*-
+ * "3072-bit MODP Group" from RFC3526, Section 4.
+ *
+ * The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 3072-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 48 rows of 8 bytes (384 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_3072[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
+        0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
+        0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
+        0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
+        0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
+        0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
+        0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
+        0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
+        0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
+        0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
+        0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
+        0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
+        0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
+        0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
+        0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
+        0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
+        0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
+        0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
+        0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
+        0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
+        0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
+        0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
+        0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
+        0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
+        0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x3A, 0xD2, 0xCA,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_3072, sizeof(RFC3526_PRIME_3072), bn);
+}
+
+/*-
+ * "4096-bit MODP Group" from RFC3526, Section 5.
+ *
+ * The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 4096-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 64 rows of 8 bytes (512 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_4096[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
+        0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
+        0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
+        0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
+        0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
+        0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
+        0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
+        0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
+        0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
+        0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
+        0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
+        0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
+        0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
+        0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
+        0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
+        0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
+        0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
+        0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
+        0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
+        0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
+        0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
+        0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
+        0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
+        0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
+        0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
+        0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
+        0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
+        0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
+        0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
+        0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
+        0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
+        0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
+        0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
+        0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
+        0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
+        0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
+        0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
+        0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
+        0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
+        0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
+        0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x06, 0x31, 0x99,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_4096, sizeof(RFC3526_PRIME_4096), bn);
+}
+
+/*-
+ * "6144-bit MODP Group" from RFC3526, Section 6.
+ *
+ * The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 6144-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 96 rows of 8 bytes (768 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_6144[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
+        0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
+        0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
+        0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
+        0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
+        0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
+        0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
+        0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
+        0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
+        0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
+        0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
+        0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
+        0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
+        0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
+        0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
+        0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
+        0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
+        0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
+        0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
+        0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
+        0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
+        0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
+        0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
+        0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
+        0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
+        0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
+        0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
+        0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
+        0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
+        0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
+        0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
+        0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
+        0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
+        0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
+        0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
+        0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
+        0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
+        0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
+        0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
+        0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
+        0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
+        0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
+        0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
+        0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
+        0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
+        0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
+        0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
+        0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
+        0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
+        0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
+        0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
+        0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
+        0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
+        0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
+        0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
+        0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
+        0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
+        0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
+        0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
+        0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
+        0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
+        0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
+        0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
+        0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
+        0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
+        0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
+        0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
+        0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
+        0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
+        0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
+        0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
+        0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
+        0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xCC, 0x40, 0x24,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_6144, sizeof(RFC3526_PRIME_6144), bn);
+}
+
+/*-
+ * "8192-bit MODP Group" from RFC3526, Section 7.
+ *
+ * The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn)
+{
+    /*
+     * Returns the result of BN_bin2bn() applied to the fixed 8192-bit prime
+     * below; bn is forwarded unchanged as BN_bin2bn()'s third argument.
+     *
+     * The table is 128 rows of 8 bytes (1024 bytes).
+     * NOTE(review): BN_bin2bn() is documented to read its input most
+     * significant byte first and to allocate a new BIGNUM when bn is
+     * NULL - confirm against bn_lib.c.
+     */
+    static const unsigned char RFC3526_PRIME_8192[] = {
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
+        0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+        0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
+        0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
+        0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+        0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
+        0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
+        0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+        0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
+        0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
+        0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+        0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
+        0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
+        0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
+        0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
+        0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
+        0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
+        0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
+        0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
+        0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
+        0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
+        0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
+        0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
+        0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
+        0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
+        0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
+        0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
+        0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
+        0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
+        0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
+        0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
+        0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
+        0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
+        0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
+        0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
+        0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
+        0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
+        0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
+        0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
+        0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
+        0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
+        0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
+        0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
+        0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
+        0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
+        0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
+        0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
+        0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
+        0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
+        0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
+        0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
+        0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
+        0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
+        0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
+        0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
+        0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
+        0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
+        0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
+        0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
+        0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
+        0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
+        0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
+        0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
+        0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
+        0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
+        0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
+        0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
+        0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
+        0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
+        0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
+        0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
+        0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
+        0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
+        0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
+        0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
+        0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
+        0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
+        0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
+        0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
+        0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
+        0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
+        0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
+        0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
+        0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
+        0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
+        0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
+        0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
+        0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
+        0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
+        0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
+        0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
+        0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
+        0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
+        0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xBE, 0x11, 0x59,
+        0x74, 0xA3, 0x92, 0x6F, 0x12, 0xFE, 0xE5, 0xE4,
+        0x38, 0x77, 0x7C, 0xB6, 0xA9, 0x32, 0xDF, 0x8C,
+        0xD8, 0xBE, 0xC4, 0xD0, 0x73, 0xB9, 0x31, 0xBA,
+        0x3B, 0xC8, 0x32, 0xB6, 0x8D, 0x9D, 0xD3, 0x00,
+        0x74, 0x1F, 0xA7, 0xBF, 0x8A, 0xFC, 0x47, 0xED,
+        0x25, 0x76, 0xF6, 0x93, 0x6B, 0xA4, 0x24, 0x66,
+        0x3A, 0xAB, 0x63, 0x9C, 0x5A, 0xE4, 0xF5, 0x68,
+        0x34, 0x23, 0xB4, 0x74, 0x2B, 0xF1, 0xC9, 0x78,
+        0x23, 0x8F, 0x16, 0xCB, 0xE3, 0x9D, 0x65, 0x2D,
+        0xE3, 0xFD, 0xB8, 0xBE, 0xFC, 0x84, 0x8A, 0xD9,
+        0x22, 0x22, 0x2E, 0x04, 0xA4, 0x03, 0x7C, 0x07,
+        0x13, 0xEB, 0x57, 0xA8, 0x1A, 0x23, 0xF0, 0xC7,
+        0x34, 0x73, 0xFC, 0x64, 0x6C, 0xEA, 0x30, 0x6B,
+        0x4B, 0xCB, 0xC8, 0x86, 0x2F, 0x83, 0x85, 0xDD,
+        0xFA, 0x9D, 0x4B, 0x7F, 0xA2, 0xC0, 0x87, 0xE8,
+        0x79, 0x68, 0x33, 0x03, 0xED, 0x5B, 0xDD, 0x3A,
+        0x06, 0x2B, 0x3C, 0xF5, 0xB3, 0xA2, 0x78, 0xA6,
+        0x6D, 0x2A, 0x13, 0xF8, 0x3F, 0x44, 0xF8, 0x2D,
+        0xDF, 0x31, 0x0E, 0xE0, 0x74, 0xAB, 0x6A, 0x36,
+        0x45, 0x97, 0xE8, 0x99, 0xA0, 0x25, 0x5D, 0xC1,
+        0x64, 0xF3, 0x1C, 0xC5, 0x08, 0x46, 0x85, 0x1D,
+        0xF9, 0xAB, 0x48, 0x19, 0x5D, 0xED, 0x7E, 0xA1,
+        0xB1, 0xD5, 0x10, 0xBD, 0x7E, 0xE7, 0x4D, 0x73,
+        0xFA, 0xF3, 0x6B, 0xC3, 0x1E, 0xCF, 0xA2, 0x68,
+        0x35, 0x90, 0x46, 0xF4, 0xEB, 0x87, 0x9F, 0x92,
+        0x40, 0x09, 0x43, 0x8B, 0x48, 0x1C, 0x6C, 0xD7,
+        0x88, 0x9A, 0x00, 0x2E, 0xD5, 0xEE, 0x38, 0x2B,
+        0xC9, 0x19, 0x0D, 0xA6, 0xFC, 0x02, 0x6E, 0x47,
+        0x95, 0x58, 0xE4, 0x47, 0x56, 0x77, 0xE9, 0xAA,
+        0x9E, 0x30, 0x50, 0xE2, 0x76, 0x56, 0x94, 0xDF,
+        0xC8, 0x1F, 0x56, 0xE8, 0x80, 0xB9, 0x6E, 0x71,
+        0x60, 0xC9, 0x80, 0xDD, 0x98, 0xED, 0xD3, 0xDF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    /* sizeof() of the array is its byte count, so the whole table is used. */
+    return BN_bin2bn(RFC3526_PRIME_8192, sizeof(RFC3526_PRIME_8192), bn);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_const.o
+++ b/openssl-1.0.2f/crypto/bn/bn_const.o
--- a/openssl-1.0.2f/crypto/bn/bn_ctx.c
+++ b/openssl-1.0.2f/crypto/bn/bn_ctx.c
@@ -0,0 +1,448 @@
+/* crypto/bn/bn_ctx.c */
+/* Written by Ulf Moeller for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#if !defined(BN_CTX_DEBUG) && !defined(BN_DEBUG)
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*-
+ * TODO list
+ *
+ * 1. Check a bunch of "(words+1)" type hacks in various bignum functions and
+ * check they can be safely removed.
+ *  - Check +1 and other ugliness in BN_from_montgomery()
+ *
+ * 2. Consider allowing a BN_new_ex() that, at least, lets you specify an
+ * appropriate 'block' size that will be honoured by bn_expand_internal() to
+ * prevent piddly little reallocations. OTOH, profiling bignum expansions in
+ * BN_CTX doesn't show this to be a big issue.
+ */
+
+/* How many bignums are in each "pool item"; */
+#define BN_CTX_POOL_SIZE        16
+/* The frame-index array grows on demand; this is its first allocation size */
+#define BN_CTX_START_FRAMES     32
+
+/***********/
+/* BN_POOL */
+/***********/
+
+/* A fixed-size bundle of bignums that can be linked with other bundles */
+typedef struct bignum_pool_item {
+    /* The bignum values; BN_CTX_POOL_SIZE of them per bundle */
+    BIGNUM vals[BN_CTX_POOL_SIZE];
+    /* Doubly-linked list admin: neighbouring bundles, NULL at the ends */
+    struct bignum_pool_item *prev, *next;
+} BN_POOL_ITEM;
+/* A linked-list of bignums grouped in bundles, used in stack (LIFO) order */
+typedef struct bignum_pool {
+    /* First bundle, bundle currently being handed out from, last bundle */
+    BN_POOL_ITEM *head, *current, *tail;
+    /* Bignums currently handed out, and bignums allocated in total */
+    unsigned used, size;
+} BN_POOL;
+static void BN_POOL_init(BN_POOL *);
+static void BN_POOL_finish(BN_POOL *);
+#ifndef OPENSSL_NO_DEPRECATED
+static void BN_POOL_reset(BN_POOL *);
+#endif
+static BIGNUM *BN_POOL_get(BN_POOL *);
+static void BN_POOL_release(BN_POOL *, unsigned int);
+
+/************/
+/* BN_STACK */
+/************/
+
+/* A wrapper to manage the "stack frames" opened by BN_CTX_start() */
+typedef struct bignum_ctx_stack {
+    /* Array of indexes into the bignum stack, one saved per open frame */
+    unsigned int *indexes;
+    /* Number of stack frames, and the size of the allocated array */
+    unsigned int depth, size;
+} BN_STACK;
+static void BN_STACK_init(BN_STACK *);
+static void BN_STACK_finish(BN_STACK *);
+#ifndef OPENSSL_NO_DEPRECATED
+static void BN_STACK_reset(BN_STACK *);
+#endif
+static int BN_STACK_push(BN_STACK *, unsigned int);
+static unsigned int BN_STACK_pop(BN_STACK *);
+
+/**********/
+/* BN_CTX */
+/**********/
+
+/* The opaque BN_CTX type: a pool of temporary bignums plus frame marks */
+struct bignum_ctx {
+    /* The bignum bundles that BN_CTX_get() hands bignums out of */
+    BN_POOL pool;
+    /* The "stack frames": value of 'used' saved at each BN_CTX_start() */
+    BN_STACK stack;
+    /* The number of bignums currently assigned */
+    unsigned int used;
+    /* Count of BN_CTX_start() calls that could not open a real frame */
+    int err_stack;
+    /* Set when a "get" fails; blocks "gets" until an "end" clears it */
+    int too_many;
+};
+
+/* Enable this to find BN_CTX bugs: dumps the context state to stderr */
+#ifdef BN_CTX_DEBUG
+static const char *ctxdbg_cur = NULL;   /* name given to CTXDBG_ENTRY */
+static void ctxdbg(BN_CTX *ctx)
+{
+    unsigned int bnidx = 0, fpidx = 0;
+    BN_POOL_ITEM *item = ctx->pool.head;
+    BN_STACK *stack = &ctx->stack;
+    fprintf(stderr, "(%16p): ", (void *)ctx);   /* %p requires a void * */
+    while (bnidx < ctx->used) {         /* one dmax per assigned bignum */
+        fprintf(stderr, "%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
+        if (!(bnidx % BN_CTX_POOL_SIZE))
+            item = item->next;
+    }
+    fprintf(stderr, "\n");
+    bnidx = 0;
+    fprintf(stderr, "          : ");
+    while (fpidx < stack->depth) {      /* one "^^^" marker per open frame */
+        while (bnidx++ < stack->indexes[fpidx])
+            fprintf(stderr, "    ");
+        fprintf(stderr, "^^^ ");
+        bnidx++;
+        fpidx++;
+    }
+    fprintf(stderr, "\n");
+}
+
+# define CTXDBG_ENTRY(str, ctx)  do { \
+                                ctxdbg_cur = (str); \
+                                fprintf(stderr,"Starting %s\n", ctxdbg_cur); \
+                                ctxdbg(ctx); \
+                                } while(0)
+# define CTXDBG_EXIT(ctx)        do { \
+                                fprintf(stderr,"Ending %s\n", ctxdbg_cur); \
+                                ctxdbg(ctx); \
+                                } while(0)
+# define CTXDBG_RET(ctx,ret)
+#else
+# define CTXDBG_ENTRY(str, ctx)
+# define CTXDBG_EXIT(ctx)
+# define CTXDBG_RET(ctx,ret)
+#endif
+
+/*
+ * This function is an evil legacy and should not be used. This
+ * implementation is WYSIWYG, though I've done my best.
+ */
+#ifndef OPENSSL_NO_DEPRECATED
+void BN_CTX_init(BN_CTX *ctx)
+{
+    /*
+     * The only sensible reading of this legacy call is "reset a context
+     * that BN_CTX_new() produced", so wind every counter back to zero and
+     * rewind the pool and frame stack without releasing their storage.
+     */
+    ctx->err_stack = ctx->too_many = 0;
+    ctx->used = 0;
+
+    /* Forget all open frames and clear the pooled bignums */
+    BN_STACK_reset(&ctx->stack);
+    BN_POOL_reset(&ctx->pool);
+}
+#endif
+
+BN_CTX *BN_CTX_new(void)
+{
+    BN_CTX *ctx = OPENSSL_malloc(sizeof(*ctx));
+    if (ctx == NULL) {
+        BNerr(BN_F_BN_CTX_NEW, ERR_R_MALLOC_FAILURE);
+        return NULL;
+    }
+    /* Fresh context: no bignums handed out, no overflow, empty containers */
+    ctx->used = 0;
+    ctx->err_stack = ctx->too_many = 0;
+    BN_STACK_init(&ctx->stack);
+    BN_POOL_init(&ctx->pool);
+    /* The caller releases the context with BN_CTX_free() */
+    return ctx;
+}
+
+void BN_CTX_free(BN_CTX *ctx)
+{
+    if (ctx == NULL)            /* freeing NULL is a no-op */
+        return;
+#ifdef BN_CTX_DEBUG
+    {
+        BN_POOL_ITEM *pool = ctx->pool.head;
+        fprintf(stderr, "BN_CTX_free, stack-size=%u, pool-bignums=%u\n",
+                ctx->stack.size, ctx->pool.size);   /* both are unsigned */
+        fprintf(stderr, "dmaxs: ");
+        while (pool) {
+            unsigned loop;
+            for (loop = 0; loop < BN_CTX_POOL_SIZE; loop++)
+                fprintf(stderr, "%02x ", pool->vals[loop].dmax);
+            pool = pool->next;
+        }
+        fprintf(stderr, "\n");
+    }
+#endif
+    BN_STACK_finish(&ctx->stack);       /* release the frame-index array */
+    BN_POOL_finish(&ctx->pool);         /* release every bignum bundle */
+    OPENSSL_free(ctx);
+}
+
+void BN_CTX_start(BN_CTX *ctx)
+{
+    CTXDBG_ENTRY("BN_CTX_start", ctx);
+    /* Overflowed or jammed by a failed "get": just nest one level deeper */
+    if (ctx->err_stack != 0 || ctx->too_many != 0) {
+        ctx->err_stack++;
+    /* Otherwise save the current usage count as the new frame's mark */
+    } else if (BN_STACK_push(&ctx->stack, ctx->used) == 0) {
+        BNerr(BN_F_BN_CTX_START, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+        ctx->err_stack++;
+    }
+    CTXDBG_EXIT(ctx);
+}
+
+void BN_CTX_end(BN_CTX *ctx)
+{
+    CTXDBG_ENTRY("BN_CTX_end", ctx);
+    if (ctx->err_stack == 0) {
+        /* A real frame is open: recover the usage mark it was opened at */
+        unsigned int mark = BN_STACK_pop(&ctx->stack);
+        /* Hand back every bignum obtained since that mark, if any */
+        if (ctx->used > mark)
+            BN_POOL_release(&ctx->pool, ctx->used - mark);
+        ctx->used = mark;
+        /* A failed "get" blocks further gets only until the frame closes */
+        ctx->too_many = 0;
+    } else
+        ctx->err_stack--;       /* unwind one level of overflowed frames */
+    CTXDBG_EXIT(ctx);
+}
+
+BIGNUM *BN_CTX_get(BN_CTX *ctx)
+{
+    BIGNUM *bn = NULL;
+    CTXDBG_ENTRY("BN_CTX_get", ctx);
+    if (ctx->err_stack != 0 || ctx->too_many != 0)  /* overflowed or jammed */
+        return NULL;
+    bn = BN_POOL_get(&ctx->pool);
+    if (bn == NULL) {
+        /*
+         * Latch the failure so that repeated "get" attempts fail quietly
+         * rather than each adding an entry to the error stack.
+         */
+        BNerr(BN_F_BN_CTX_GET, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+        ctx->too_many = 1;
+        return NULL;
+    }
+    BN_zero(bn);                /* every bignum handed out starts as zero */
+    ctx->used++;
+    CTXDBG_RET(ctx, bn);
+    return bn;
+}
+
+/************/
+/* BN_STACK */
+/************/
+
+static void BN_STACK_init(BN_STACK *st)
+{
+    st->size = st->depth = 0;   /* no frames pushed, nothing allocated */
+    st->indexes = NULL;         /* the array is created by the first push */
+}
+
+static void BN_STACK_finish(BN_STACK *st)
+{
+    if (st->size != 0)          /* size 0 means no array was allocated */
+        OPENSSL_free(st->indexes);
+}
+
+#ifndef OPENSSL_NO_DEPRECATED
+static void BN_STACK_reset(BN_STACK *st)
+{
+    st->depth = 0;              /* drop all frames; the array is kept */
+}
+#endif
+
+static int BN_STACK_push(BN_STACK *st, unsigned int idx)
+{
+    /* The array is full (or was never allocated): move to a bigger one */
+    if (st->size == st->depth) {
+        unsigned int grown, *bigger;
+        /* First allocation uses the start size; later ones grow by half */
+        grown = st->size ? (st->size * 3 / 2) : BN_CTX_START_FRAMES;
+        bigger = OPENSSL_malloc(grown * sizeof(unsigned int));
+        if (bigger == NULL)
+            return 0;           /* the existing array is left as it was */
+        if (st->depth)
+            memcpy(bigger, st->indexes, st->depth * sizeof(unsigned int));
+        if (st->size)
+            OPENSSL_free(st->indexes);
+        st->size = grown;
+        st->indexes = bigger;
+    }
+    st->indexes[st->depth] = idx;
+    st->depth++;
+    return 1;
+}
+
+static unsigned int BN_STACK_pop(BN_STACK *st)
+{
+    return st->indexes[--(st->depth)];  /* depth is not checked for 0 here */
+}
+
+/***********/
+/* BN_POOL */
+/***********/
+
+static void BN_POOL_init(BN_POOL *p)
+{
+    p->size = p->used = 0;      /* nothing allocated, nothing handed out */
+    p->tail = p->current = p->head = NULL;      /* empty bundle list */
+}
+
+static void BN_POOL_finish(BN_POOL *p)
+{
+    unsigned int i;
+    /* Tear the list down from the front, one bundle per pass */
+    while (p->head != NULL) {
+        /* Clear-free every bignum that has a data array attached */
+        for (i = 0; i < BN_CTX_POOL_SIZE; i++)
+            if (p->head->vals[i].d != NULL)
+                BN_clear_free(&p->head->vals[i]);
+        /* Note the successor before the bundle's memory is released */
+        p->current = p->head->next;
+        OPENSSL_free(p->head);
+        p->head = p->current;
+    }
+}
+
+#ifndef OPENSSL_NO_DEPRECATED
+static void BN_POOL_reset(BN_POOL *p)
+{
+    BN_POOL_ITEM *bundle;
+    unsigned int i;
+
+    /* Visit every bundle in the list, from the head onwards */
+    for (bundle = p->head; bundle != NULL; bundle = bundle->next) {
+        /* BN_clear() is applied only to bignums with a data array */
+        for (i = 0; i < BN_CTX_POOL_SIZE; i++)
+            if (bundle->vals[i].d != NULL)
+                BN_clear(&bundle->vals[i]);
+    }
+    /* Rewind to the start; the bundles themselves stay allocated */
+    p->used = 0;
+    p->current = p->head;
+}
+#endif
+
+static BIGNUM *BN_POOL_get(BN_POOL *p)
+{
+    unsigned int i;
+
+    if (p->size == p->used) {
+        BN_POOL_ITEM *fresh = OPENSSL_malloc(sizeof(*fresh));
+        /* Every allocated bignum is in use, so a new bundle is required */
+        if (fresh == NULL)
+            return NULL;
+        /* Put each bignum of the bundle into its initial state */
+        for (i = 0; i < BN_CTX_POOL_SIZE; i++)
+            BN_init(&fresh->vals[i]);
+        /* Append at the tail; an empty list also gains its head here */
+        fresh->next = NULL;
+        fresh->prev = p->tail;
+        if (p->head == NULL)
+            p->head = fresh;
+        else
+            p->tail->next = fresh;
+        p->tail = p->current = fresh;
+        /* The new bundle's first bignum is the one being handed out */
+        p->used++;
+        p->size += BN_CTX_POOL_SIZE;
+        return fresh->vals;
+    }
+    /* Otherwise reuse a bignum from the bundle that holds slot "used" */
+    if (p->used == 0)
+        p->current = p->head;
+    else if ((p->used % BN_CTX_POOL_SIZE) == 0)
+        p->current = p->current->next;
+    i = p->used++ % BN_CTX_POOL_SIZE;
+    return p->current->vals + i;
+}
+
+static void BN_POOL_release(BN_POOL *p, unsigned int num)
+{
+    unsigned int slot = (p->used - 1) % BN_CTX_POOL_SIZE;
+    for (p->used -= num; num != 0; num--) {     /* newest bignum first */
+        bn_check_top(&p->current->vals[slot]);
+        if (slot != 0) {
+            slot--;
+        } else {                /* front of bundle: step to the previous one */
+            slot = BN_CTX_POOL_SIZE - 1;
+            p->current = p->current->prev;
+        }
+    }
+}
--- a/openssl-1.0.2f/crypto/bn/bn_ctx.o
+++ b/openssl-1.0.2f/crypto/bn/bn_ctx.o
--- a/openssl-1.0.2f/crypto/bn/bn_depr.c
+++ b/openssl-1.0.2f/crypto/bn/bn_depr.c
@@ -0,0 +1,115 @@
+/* crypto/bn/bn_depr.c */
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/*
+ * Support for deprecated functions goes here - static linkage will only
+ * slurp this code if applications are using them directly.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+#include <openssl/rand.h>
+
+static void *dummy = &dummy;    /* presumably avoids an empty unit; confirm */
+
+#ifndef OPENSSL_NO_DEPRECATED
+BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
+                          const BIGNUM *add, const BIGNUM *rem,
+                          void (*callback) (int, int, void *), void *cb_arg)
+{
+    BN_GENCB cb;
+    BIGNUM *prime = ret;
+
+    /*
+     * Deprecated front end: wrap the old-style callback in a BN_GENCB and
+     * defer to BN_generate_prime_ex(). Returns the prime, or NULL on error.
+     */
+    BN_GENCB_set_old(&cb, callback, cb_arg);
+
+    /* Without a caller-supplied bignum the result is allocated here */
+    if (prime == NULL && (prime = BN_new()) == NULL)
+        return NULL;
+
+    if (BN_generate_prime_ex(prime, bits, safe, add, rem, &cb))
+        return prime;           /* we have a prime :-) */
+
+    /* Failure: free the bignum only if this call allocated it */
+    if (ret == NULL)
+        BN_free(prime);
+    return NULL;
+}
+
+int BN_is_prime(const BIGNUM *a, int checks,
+                void (*callback) (int, int, void *), BN_CTX *ctx_passed,
+                void *cb_arg)
+{
+    BN_GENCB legacy;            /* carries the old-style callback and arg */
+    BN_GENCB_set_old(&legacy, callback, cb_arg);
+    return BN_is_prime_ex(a, checks, ctx_passed, &legacy);
+}
+
+int BN_is_prime_fasttest(const BIGNUM *a, int checks,
+                         void (*callback) (int, int, void *),
+                         BN_CTX *ctx_passed, void *cb_arg,
+                         int do_trial_division)
+{
+    BN_GENCB legacy;            /* carries the old-style callback and arg */
+    BN_GENCB_set_old(&legacy, callback, cb_arg);
+    return BN_is_prime_fasttest_ex(a, checks, ctx_passed, do_trial_division,
+                                   &legacy);
+}
+#endif
--- a/openssl-1.0.2f/crypto/bn/bn_depr.o
+++ b/openssl-1.0.2f/crypto/bn/bn_depr.o
--- a/openssl-1.0.2f/crypto/bn/bn_div.c
+++ b/openssl-1.0.2f/crypto/bn/bn_div.c
@@ -0,0 +1,477 @@
+/* crypto/bn/bn_div.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <openssl/bn.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/* The old slow way */
+#if 0
+int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
+           BN_CTX *ctx)
+{
+    int i, nm, nd;
+    int ret = 0;
+    BIGNUM *D;
+
+    bn_check_top(m);
+    bn_check_top(d);
+    if (BN_is_zero(d)) {
+        BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
+        return (0);
+    }
+
+    if (BN_ucmp(m, d) < 0) {    /* |m| < |d|: dv = 0 and rem = m */
+        if (rem != NULL) {
+            if (BN_copy(rem, m) == NULL)
+                return (0);
+        }
+        if (dv != NULL)
+            BN_zero(dv);
+        return (1);
+    }
+
+    BN_CTX_start(ctx);
+    D = BN_CTX_get(ctx);
+    if (dv == NULL)
+        dv = BN_CTX_get(ctx);   /* caller does not want the quotient */
+    if (rem == NULL)
+        rem = BN_CTX_get(ctx);  /* caller does not want the remainder */
+    if (D == NULL || dv == NULL || rem == NULL)
+        goto end;
+
+    nd = BN_num_bits(d);
+    nm = BN_num_bits(m);
+    if (BN_copy(D, d) == NULL)
+        goto end;
+    if (BN_copy(rem, m) == NULL)
+        goto end;
+
+    /*
+     * The next 2 are needed so we can do a dv->d[0]|=1 later since
+     * BN_lshift1 will only work once there is a value :-)
+     */
+    BN_zero(dv);
+    if (bn_wexpand(dv, 1) == NULL)
+        goto end;
+    dv->top = 1;
+
+    if (!BN_lshift(D, D, nm - nd))      /* give D as many bits as m */
+        goto end;
+    for (i = nm - nd; i >= 0; i--) {    /* one quotient bit per pass */
+        if (!BN_lshift1(dv, dv))
+            goto end;
+        if (BN_ucmp(rem, D) >= 0) {
+            dv->d[0] |= 1;
+            if (!BN_usub(rem, rem, D))
+                goto end;
+        }
+/* CAN IMPROVE (and have now :=) */
+        if (!BN_rshift1(D, D))
+            goto end;
+    }
+    rem->neg = BN_is_zero(rem) ? 0 : m->neg;
+    dv->neg = m->neg ^ d->neg;
+    ret = 1;
+ end:
+    BN_CTX_end(ctx);
+    return (ret);
+}
+
+#else
+
+# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
+    && !defined(PEDANTIC) && !defined(BN_DIV3W)
+#  if defined(__GNUC__) && __GNUC__>=2
+#   if defined(__i386) || defined (__i386__)
+   /*-
+    * There were two reasons for implementing this template:
+    * - GNU C generates a call to a function (__udivdi3 to be exact)
+    *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
+    *   understand why...);
+    * - divl doesn't only calculate quotient, but also leaves
+    *   remainder in %edx which we can definitely use here:-)
+    * NB: the macro writes the caller's 'q' and 'rem' variables directly.
+    *                                   <appro@fy.chalmers.se>
+    */
+#    undef bn_div_words
+#    define bn_div_words(n0,n1,d0)                \
+        ({  asm volatile (                      \
+                "divl   %4"                     \
+                : "=a"(q), "=d"(rem)            \
+                : "a"(n1), "d"(n0), "g"(d0)     \
+                : "cc");                        \
+            q;                                  \
+        })
+#    define REMAINDER_IS_ALREADY_CALCULATED
+#   elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
+   /*
+    * Same story here, but it's 128-bit by 64-bit division (n0:n1 by d0).
+    *                                   <appro@fy.chalmers.se>
+    */
+#    undef bn_div_words
+#    define bn_div_words(n0,n1,d0)                \
+        ({  asm volatile (                      \
+                "divq   %4"                     \
+                : "=a"(q), "=d"(rem)            \
+                : "a"(n1), "d"(n0), "g"(d0)     \
+                : "cc");                        \
+            q;                                  \
+        })
+#    define REMAINDER_IS_ALREADY_CALCULATED
+#   endif                       /* __<cpu> */
+#  endif                        /* __GNUC__ */
+# endif                         /* OPENSSL_NO_ASM */
+
+/*-
+ * BN_div computes  dv := num / divisor,  rounding towards
+ * zero, and sets up rm  such that  dv*divisor + rm = num  holds. Thus:
+ *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
+ *     rm->neg == num->neg                 (unless the remainder is zero)
+ * If 'dv' or 'rm' is NULL, the respective value is not returned.
+ * Returns 1 on success and 0 on error.
+ */
+int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
+           BN_CTX *ctx)
+{
+    int norm_shift, i, loop;
+    BIGNUM *tmp, wnum, *snum, *sdiv, *res;
+    BN_ULONG *resp, *wnump;
+    BN_ULONG d0, d1;
+    int num_n, div_n;
+    int no_branch = 0;          /* set when an operand is BN_FLG_CONSTTIME */
+
+    /*
+     * Invalid zero-padding would have particularly bad consequences so don't
+     * just rely on bn_check_top() here (bn_check_top() works only for
+     * BN_DEBUG builds)
+     */
+    if ((num->top > 0 && num->d[num->top - 1] == 0) ||
+        (divisor->top > 0 && divisor->d[divisor->top - 1] == 0)) {
+        BNerr(BN_F_BN_DIV, BN_R_NOT_INITIALIZED);
+        return 0;
+    }
+
+    bn_check_top(num);
+    bn_check_top(divisor);
+
+    if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0)
+        || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) {
+        no_branch = 1;
+    }
+
+    bn_check_top(dv);
+    bn_check_top(rm);
+    /*- bn_check_top(num); *//*
+     * 'num' has been checked already
+     */
+    /*- bn_check_top(divisor); *//*
+     * 'divisor' has been checked already
+     */
+
+    if (BN_is_zero(divisor)) {
+        BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
+        return (0);
+    }
+
+    if (!no_branch && BN_ucmp(num, divisor) < 0) {
+        if (rm != NULL) {
+            if (BN_copy(rm, num) == NULL)
+                return (0);
+        }
+        if (dv != NULL)
+            BN_zero(dv);
+        return (1);
+    }
+
+    BN_CTX_start(ctx);
+    tmp = BN_CTX_get(ctx);
+    snum = BN_CTX_get(ctx);
+    sdiv = BN_CTX_get(ctx);
+    if (dv == NULL)
+        res = BN_CTX_get(ctx);
+    else
+        res = dv;
+    if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL)
+        goto err;
+
+    /* First we normalise the numbers */
+    norm_shift = BN_BITS2 - ((BN_num_bits(divisor)) % BN_BITS2);
+    if (!(BN_lshift(sdiv, divisor, norm_shift)))
+        goto err;
+    sdiv->neg = 0;
+    norm_shift += BN_BITS2;
+    if (!(BN_lshift(snum, num, norm_shift)))
+        goto err;
+    snum->neg = 0;
+
+    if (no_branch) {
+        /*
+         * Since we don't know whether snum is larger than sdiv, we pad snum
+         * with enough zeroes without changing its value.
+         */
+        if (snum->top <= sdiv->top + 1) {
+            if (bn_wexpand(snum, sdiv->top + 2) == NULL)
+                goto err;
+            for (i = snum->top; i < sdiv->top + 2; i++)
+                snum->d[i] = 0;
+            snum->top = sdiv->top + 2;
+        } else {
+            if (bn_wexpand(snum, snum->top + 1) == NULL)
+                goto err;
+            snum->d[snum->top] = 0;
+            snum->top++;
+        }
+    }
+
+    div_n = sdiv->top;
+    num_n = snum->top;
+    loop = num_n - div_n;
+    /*
+     * Let's set up a 'window' into snum. This is the part that corresponds
+     * to the current 'area' being divided
+     */
+    wnum.neg = 0;
+    wnum.d = &(snum->d[loop]);
+    wnum.top = div_n;
+    /* only needed when BN_ucmp messes up the values between top and max */
+    wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
+
+    /* Get the top 2 words of sdiv */
+    /* div_n=sdiv->top; */
+    d0 = sdiv->d[div_n - 1];
+    d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
+
+    /* pointer to the 'top' of snum */
+    wnump = &(snum->d[num_n - 1]);
+
+    /* Setup to 'res' */
+    res->neg = (num->neg ^ divisor->neg);
+    if (!bn_wexpand(res, (loop + 1)))
+        goto err;
+    res->top = loop - no_branch;
+    resp = &(res->d[loop - 1]);
+
+    /* space for temp */
+    if (!bn_wexpand(tmp, (div_n + 1)))
+        goto err;
+
+    if (!no_branch) {
+        if (BN_ucmp(&wnum, sdiv) >= 0) {
+            /*
+             * If BN_DEBUG_RAND is defined BN_ucmp changes (via bn_pollute)
+             * the const bignum arguments => clean the values between top and
+             * max again
+             */
+            bn_clear_top2max(&wnum);
+            bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+            *resp = 1;
+        } else
+            res->top--;
+    }
+
+    /*
+     * if res->top == 0 then clear the neg value otherwise decrease the resp
+     * pointer
+     */
+    if (res->top == 0)
+        res->neg = 0;
+    else
+        resp--;
+
+    for (i = 0; i < loop - 1; i++, wnump--, resp--) {
+        BN_ULONG q, l0;
+        /*
+         * the first part of the loop uses the top two words of snum and sdiv
+         * to calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv
+         */
+# if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
+        BN_ULONG bn_div_3_words(BN_ULONG *, BN_ULONG, BN_ULONG);
+        q = bn_div_3_words(wnump, d1, d0);
+# else
+        BN_ULONG n0, n1, rem = 0;
+
+        n0 = wnump[0];
+        n1 = wnump[-1];
+        if (n0 == d0)
+            q = BN_MASK2;
+        else {                  /* n0 < d0 */
+
+#  ifdef BN_LLONG
+            BN_ULLONG t2;
+
+#   if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
+            q = (BN_ULONG)(((((BN_ULLONG) n0) << BN_BITS2) | n1) / d0);
+#   else
+            q = bn_div_words(n0, n1, d0);
+#    ifdef BN_DEBUG_LEVITTE
+            fprintf(stderr, "DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n", n0, n1, d0, q);
+#    endif
+#   endif
+
+#   ifndef REMAINDER_IS_ALREADY_CALCULATED
+            /*
+             * rem doesn't have to be BN_ULLONG. The least we
+             * know it's less than d0, isn't it?
+             */
+            rem = (n1 - q * d0) & BN_MASK2;
+#   endif
+            t2 = (BN_ULLONG) d1 *q;
+
+            for (;;) {
+                if (t2 <= ((((BN_ULLONG) rem) << BN_BITS2) | wnump[-2]))
+                    break;
+                q--;
+                rem += d0;
+                if (rem < d0)
+                    break;      /* don't let rem overflow */
+                t2 -= d1;
+            }
+#  else                         /* !BN_LLONG */
+            BN_ULONG t2l, t2h;
+
+            q = bn_div_words(n0, n1, d0);
+#   ifdef BN_DEBUG_LEVITTE
+            fprintf(stderr, "DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n", n0, n1, d0, q);
+#   endif
+#   ifndef REMAINDER_IS_ALREADY_CALCULATED
+            rem = (n1 - q * d0) & BN_MASK2;
+#   endif
+
+#   if defined(BN_UMULT_LOHI)
+            BN_UMULT_LOHI(t2l, t2h, d1, q);
+#   elif defined(BN_UMULT_HIGH)
+            t2l = d1 * q;
+            t2h = BN_UMULT_HIGH(d1, q);
+#   else
+            {
+                BN_ULONG ql, qh;
+                t2l = LBITS(d1);
+                t2h = HBITS(d1);
+                ql = LBITS(q);
+                qh = HBITS(q);
+                mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
+            }
+#   endif
+
+            for (;;) {
+                if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2])))
+                    break;
+                q--;
+                rem += d0;
+                if (rem < d0)
+                    break;      /* don't let rem overflow */
+                if (t2l < d1)
+                    t2h--;
+                t2l -= d1;
+            }
+#  endif                        /* !BN_LLONG */
+        }
+# endif                         /* !BN_DIV3W */
+
+        l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
+        tmp->d[div_n] = l0;
+        wnum.d--;
+        /*
+         * ignore top values of the bignums just sub the two BN_ULONG arrays
+         * with bn_sub_words
+         */
+        if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
+            /*
+             * Note: As we have considered only the leading two BN_ULONGs in
+             * the calculation of q, sdiv * q might be greater than wnum (but
+             * then (q-1) * sdiv is less or equal than wnum)
+             */
+            q--;
+            if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
+                /*
+                 * we can't have an overflow here (assuming that q != 0, but
+                 * if q == 0 then tmp is zero anyway)
+                 */
+                (*wnump)++;
+        }
+        /* store part of the result */
+        *resp = q;
+    }
+    bn_correct_top(snum);
+    if (rm != NULL) {
+        /*
+         * Keep a copy of the neg flag in num because if rm==num BN_rshift()
+         * will overwrite it.
+         */
+        int neg = num->neg;
+        /* BN_rshift() can fail; don't report success with a bad remainder */
+        if (!BN_rshift(rm, snum, norm_shift))
+            goto err;
+        if (!BN_is_zero(rm))
+            rm->neg = neg;
+        bn_check_top(rm);
+    }
+    if (no_branch)
+        bn_correct_top(res);
+    BN_CTX_end(ctx);
+    return (1);
+ err:
+    bn_check_top(rm);
+    BN_CTX_end(ctx);
+    return (0);
+}
+#endif
--- a/openssl-1.0.2f/crypto/bn/bn_div.o
+++ b/openssl-1.0.2f/crypto/bn/bn_div.o
--- a/openssl-1.0.2f/crypto/bn/bn_err.c
+++ b/openssl-1.0.2f/crypto/bn/bn_err.c
@@ -0,0 +1,154 @@
+/* crypto/bn/bn_err.c */
+/* ====================================================================
+ * Copyright (c) 1999-2015 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/*
+ * NOTE: this file was auto generated by the mkerr.pl script: any changes
+ * made to it will be overwritten when the script next updates this file,
+ * only reason strings will be preserved.
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/bn.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+# define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
+# define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)
+
+/*
+ * Function-name table for the BN library: each entry pairs a function code,
+ * packed with ERR_FUNC() (i.e. ERR_PACK(ERR_LIB_BN, func, 0)), with the
+ * name string to report for it.  The list ends with a {0, NULL} sentinel
+ * and is handed to ERR_load_strings() by ERR_load_BN_strings().
+ */
+static ERR_STRING_DATA BN_str_functs[] = {
+    {ERR_FUNC(BN_F_BNRAND), "BNRAND"},
+    {ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX), "BN_BLINDING_convert_ex"},
+    {ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM), "BN_BLINDING_create_param"},
+    {ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX), "BN_BLINDING_invert_ex"},
+    {ERR_FUNC(BN_F_BN_BLINDING_NEW), "BN_BLINDING_new"},
+    {ERR_FUNC(BN_F_BN_BLINDING_UPDATE), "BN_BLINDING_update"},
+    {ERR_FUNC(BN_F_BN_BN2DEC), "BN_bn2dec"},
+    {ERR_FUNC(BN_F_BN_BN2HEX), "BN_bn2hex"},
+    {ERR_FUNC(BN_F_BN_CTX_GET), "BN_CTX_get"},
+    {ERR_FUNC(BN_F_BN_CTX_NEW), "BN_CTX_new"},
+    {ERR_FUNC(BN_F_BN_CTX_START), "BN_CTX_start"},
+    {ERR_FUNC(BN_F_BN_DIV), "BN_div"},
+    {ERR_FUNC(BN_F_BN_DIV_NO_BRANCH), "BN_div_no_branch"},
+    {ERR_FUNC(BN_F_BN_DIV_RECP), "BN_div_recp"},
+    {ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
+    {ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
+    {ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD), "BN_GF2m_mod_solve_quad"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR), "BN_GF2m_mod_solve_quad_arr"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_SQR), "BN_GF2m_mod_sqr"},
+    {ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT), "BN_GF2m_mod_sqrt"},
+    {ERR_FUNC(BN_F_BN_LSHIFT), "BN_lshift"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP2_MONT), "BN_mod_exp2_mont"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP_MONT), "BN_mod_exp_mont"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME), "BN_mod_exp_mont_consttime"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD), "BN_mod_exp_mont_word"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP_RECP), "BN_mod_exp_recp"},
+    {ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE), "BN_mod_exp_simple"},
+    {ERR_FUNC(BN_F_BN_MOD_INVERSE), "BN_mod_inverse"},
+    {ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH), "BN_mod_inverse_no_branch"},
+    {ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK), "BN_mod_lshift_quick"},
+    {ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL), "BN_mod_mul_reciprocal"},
+    {ERR_FUNC(BN_F_BN_MOD_SQRT), "BN_mod_sqrt"},
+    {ERR_FUNC(BN_F_BN_MPI2BN), "BN_mpi2bn"},
+    {ERR_FUNC(BN_F_BN_NEW), "BN_new"},
+    {ERR_FUNC(BN_F_BN_RAND), "BN_rand"},
+    {ERR_FUNC(BN_F_BN_RAND_RANGE), "BN_rand_range"},
+    {ERR_FUNC(BN_F_BN_RSHIFT), "BN_rshift"},
+    {ERR_FUNC(BN_F_BN_USUB), "BN_usub"},
+    {0, NULL}
+};
+
+/*
+ * Reason-string table for the BN library: each entry pairs a reason code,
+ * packed with ERR_REASON() (i.e. ERR_PACK(ERR_LIB_BN, 0, reason)), with its
+ * human-readable text.  The list ends with a {0, NULL} sentinel and is
+ * handed to ERR_load_strings() by ERR_load_BN_strings().
+ */
+static ERR_STRING_DATA BN_str_reasons[] = {
+    {ERR_REASON(BN_R_ARG2_LT_ARG3), "arg2 lt arg3"},
+    {ERR_REASON(BN_R_BAD_RECIPROCAL), "bad reciprocal"},
+    {ERR_REASON(BN_R_BIGNUM_TOO_LONG), "bignum too long"},
+    {ERR_REASON(BN_R_BITS_TOO_SMALL), "bits too small"},
+    {ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS), "called with even modulus"},
+    {ERR_REASON(BN_R_DIV_BY_ZERO), "div by zero"},
+    {ERR_REASON(BN_R_ENCODING_ERROR), "encoding error"},
+    {ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),
+     "expand on static bignum data"},
+    {ERR_REASON(BN_R_INPUT_NOT_REDUCED), "input not reduced"},
+    {ERR_REASON(BN_R_INVALID_LENGTH), "invalid length"},
+    {ERR_REASON(BN_R_INVALID_RANGE), "invalid range"},
+    {ERR_REASON(BN_R_INVALID_SHIFT), "invalid shift"},
+    {ERR_REASON(BN_R_NOT_A_SQUARE), "not a square"},
+    {ERR_REASON(BN_R_NOT_INITIALIZED), "not initialized"},
+    {ERR_REASON(BN_R_NO_INVERSE), "no inverse"},
+    {ERR_REASON(BN_R_NO_SOLUTION), "no solution"},
+    {ERR_REASON(BN_R_P_IS_NOT_PRIME), "p is not prime"},
+    {ERR_REASON(BN_R_TOO_MANY_ITERATIONS), "too many iterations"},
+    {ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),
+     "too many temporary variables"},
+    {0, NULL}
+};
+
+#endif
+
+/*
+ * Hand the BN function-name and reason-string tables to the error library.
+ * Compiles to an empty function when OPENSSL_NO_ERR is defined.
+ */
+void ERR_load_BN_strings(void)
+{
+#ifndef OPENSSL_NO_ERR
+    /*
+     * If the first function code already resolves to a string, the tables
+     * are taken to be loaded and nothing more is done.
+     */
+    if (ERR_func_error_string(BN_str_functs[0].error) != NULL)
+        return;
+
+    ERR_load_strings(0, BN_str_functs);
+    ERR_load_strings(0, BN_str_reasons);
+#endif
+}
--- a/openssl-1.0.2f/crypto/bn/bn_err.o
+++ b/openssl-1.0.2f/crypto/bn/bn_err.o
--- a/openssl-1.0.2f/crypto/bn/bn_exp.c
+++ b/openssl-1.0.2f/crypto/bn/bn_exp.c
--- a/openssl-1.0.2f/crypto/bn/bn_exp.o
+++ b/openssl-1.0.2f/crypto/bn/bn_exp.o
--- a/openssl-1.0.2f/crypto/bn/bn_exp2.c
+++ b/openssl-1.0.2f/crypto/bn/bn_exp2.c
@@ -0,0 +1,303 @@
+/* crypto/bn/bn_exp2.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+#define TABLE_SIZE      32
+
+/*
+ * BN_mod_exp2_mont() computes rr = (a1^p1 * a2^p2) mod m with Montgomery
+ * multiplication, scanning both exponents in a single pass and keeping an
+ * independent sliding window for each of them.
+ *
+ *  rr       receives the result.
+ *  a1, p1   first base and exponent.
+ *  a2, p2   second base and exponent.
+ *  m        modulus; must be odd, otherwise BN_R_CALLED_WITH_EVEN_MODULUS
+ *           is raised and 0 is returned.
+ *  ctx      supplies the temporaries (d, r and both tables of odd powers).
+ *  in_mont  Montgomery context to use (presumably one set up for m by the
+ *           caller); when NULL a temporary context is created from m and
+ *           freed again before returning.
+ *
+ * Returns 1 on success and 0 on error.
+ */
+int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
+                     const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+                     BN_CTX *ctx, BN_MONT_CTX *in_mont)
+{
+    int i, j, bits, b, bits1, bits2, ret =
+        0, wpos1, wpos2, window1, window2, wvalue1, wvalue2;
+    int r_is_one = 1;
+    BIGNUM *d, *r;
+    const BIGNUM *a_mod_m;
+    /* Tables of variables obtained from 'ctx' */
+    BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
+    BN_MONT_CTX *mont = NULL;
+
+    bn_check_top(a1);
+    bn_check_top(p1);
+    bn_check_top(a2);
+    bn_check_top(p2);
+    bn_check_top(m);
+
+    /*
+     * Use BN_is_odd() rather than reading m->d[0] directly: a zero-valued
+     * modulus may have no words allocated, so m->d[0] must not be touched
+     * before m is known to be non-empty.
+     */
+    if (!BN_is_odd(m)) {
+        BNerr(BN_F_BN_MOD_EXP2_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
+        return (0);
+    }
+    bits1 = BN_num_bits(p1);
+    bits2 = BN_num_bits(p2);
+    if ((bits1 == 0) && (bits2 == 0)) {
+        /*
+         * a1^0 * a2^0 == 1, but the result still has to be reduced:
+         * 1 mod 1 is 0.
+         */
+        if (BN_is_one(m)) {
+            BN_zero(rr);
+            ret = 1;
+        } else
+            ret = BN_one(rr);
+        return ret;
+    }
+
+    bits = (bits1 > bits2) ? bits1 : bits2;
+
+    BN_CTX_start(ctx);
+    d = BN_CTX_get(ctx);
+    r = BN_CTX_get(ctx);
+    val1[0] = BN_CTX_get(ctx);
+    val2[0] = BN_CTX_get(ctx);
+    if (!d || !r || !val1[0] || !val2[0])
+        goto err;
+
+    if (in_mont != NULL)
+        mont = in_mont;
+    else {
+        if ((mont = BN_MONT_CTX_new()) == NULL)
+            goto err;
+        if (!BN_MONT_CTX_set(mont, m, ctx))
+            goto err;
+    }
+
+    window1 = BN_window_bits_for_exponent_size(bits1);
+    window2 = BN_window_bits_for_exponent_size(bits2);
+
+    /*
+     * Build table for a1:   val1[i] := a1^(2*i + 1) mod m
+     * for i = 0 .. 2^(window1-1) - 1
+     */
+    if (a1->neg || BN_ucmp(a1, m) >= 0) {
+        if (!BN_mod(val1[0], a1, m, ctx))
+            goto err;
+        a_mod_m = val1[0];
+    } else
+        a_mod_m = a1;
+    /*
+     * A base that is 0 mod m forces the whole product to 0, but only when
+     * its exponent is non-zero.  With p1 == 0 the factor a1^p1 is 1 (the
+     * same convention as the bits1 == bits2 == 0 case above), so the result
+     * is decided by a2^p2 alone and we must carry on.  In that case p1 has
+     * no set bits and the main loop never reads val1[].
+     */
+    if (bits1 > 0 && BN_is_zero(a_mod_m)) {
+        BN_zero(rr);
+        ret = 1;
+        goto err;
+    }
+
+    if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx))
+        goto err;
+    if (window1 > 1) {
+        /* d = a1^2, the step between consecutive odd powers */
+        if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx))
+            goto err;
+
+        j = 1 << (window1 - 1);
+        for (i = 1; i < j; i++) {
+            if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
+                !BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx))
+                goto err;
+        }
+    }
+
+    /*
+     * Build table for a2:   val2[i] := a2^(2*i + 1) mod m
+     * for i = 0 .. 2^(window2-1) - 1
+     */
+    if (a2->neg || BN_ucmp(a2, m) >= 0) {
+        if (!BN_mod(val2[0], a2, m, ctx))
+            goto err;
+        a_mod_m = val2[0];
+    } else
+        a_mod_m = a2;
+    /* Same reasoning as for a1: only a non-zero exponent makes this 0. */
+    if (bits2 > 0 && BN_is_zero(a_mod_m)) {
+        BN_zero(rr);
+        ret = 1;
+        goto err;
+    }
+    if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx))
+        goto err;
+    if (window2 > 1) {
+        /* d = a2^2, the step between consecutive odd powers */
+        if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx))
+            goto err;
+
+        j = 1 << (window2 - 1);
+        for (i = 1; i < j; i++) {
+            if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
+                !BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx))
+                goto err;
+        }
+    }
+
+    /* Now compute the power product, using independent windows. */
+    r_is_one = 1;
+    wvalue1 = 0;                /* The 'value' of the first window */
+    wvalue2 = 0;                /* The 'value' of the second window */
+    wpos1 = 0;                  /* If wvalue1 > 0, the bottom bit of the
+                                 * first window */
+    wpos2 = 0;                  /* If wvalue2 > 0, the bottom bit of the
+                                 * second window */
+
+    if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
+        goto err;
+    for (b = bits - 1; b >= 0; b--) {
+        /* squaring is pointless while r is still the Montgomery form of 1 */
+        if (!r_is_one) {
+            if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
+                goto err;
+        }
+
+        if (!wvalue1)
+            if (BN_is_bit_set(p1, b)) {
+                /*
+                 * consider bits b-window1+1 .. b for this window
+                 */
+                i = b - window1 + 1;
+                while (!BN_is_bit_set(p1, i)) /* works for i<0 */
+                    i++;
+                wpos1 = i;
+                wvalue1 = 1;
+                for (i = b - 1; i >= wpos1; i--) {
+                    wvalue1 <<= 1;
+                    if (BN_is_bit_set(p1, i))
+                        wvalue1++;
+                }
+            }
+
+        if (!wvalue2)
+            if (BN_is_bit_set(p2, b)) {
+                /*
+                 * consider bits b-window2+1 .. b for this window
+                 */
+                i = b - window2 + 1;
+                while (!BN_is_bit_set(p2, i)) /* works for i<0 */
+                    i++;
+                wpos2 = i;
+                wvalue2 = 1;
+                for (i = b - 1; i >= wpos2; i--) {
+                    wvalue2 <<= 1;
+                    if (BN_is_bit_set(p2, i))
+                        wvalue2++;
+                }
+            }
+
+        if (wvalue1 && b == wpos1) {
+            /* wvalue1 is odd and < 2^window1 */
+            if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx))
+                goto err;
+            wvalue1 = 0;
+            r_is_one = 0;
+        }
+
+        if (wvalue2 && b == wpos2) {
+            /* wvalue2 is odd and < 2^window2 */
+            if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx))
+                goto err;
+            wvalue2 = 0;
+            r_is_one = 0;
+        }
+    }
+    if (!BN_from_montgomery(rr, r, mont, ctx))
+        goto err;
+    ret = 1;
+ err:
+    /* only free the Montgomery context if it was created here */
+    if ((in_mont == NULL) && (mont != NULL))
+        BN_MONT_CTX_free(mont);
+    BN_CTX_end(ctx);
+    bn_check_top(rr);
+    return (ret);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_exp2.o
+++ b/openssl-1.0.2f/crypto/bn/bn_exp2.o
--- a/openssl-1.0.2f/crypto/bn/bn_gcd.c
+++ b/openssl-1.0.2f/crypto/bn/bn_gcd.c
@@ -0,0 +1,702 @@
+/* crypto/bn/bn_gcd.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
+
+/*
+ * BN_gcd() stores the greatest common divisor of in_a and in_b in r.
+ *
+ * Both inputs are copied into temporaries taken from ctx and their signs
+ * are dropped; the larger copy is passed to euclid() as its first argument.
+ * Returns 1 on success and 0 on error.
+ */
+int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
+{
+    BIGNUM *x, *y, *g;
+    int ok = 0;
+
+    bn_check_top(in_a);
+    bn_check_top(in_b);
+
+    BN_CTX_start(ctx);
+    x = BN_CTX_get(ctx);
+    y = BN_CTX_get(ctx);
+    if (x == NULL || y == NULL)
+        goto err;
+
+    /* work on private, non-negative copies of the operands */
+    if (BN_copy(x, in_a) == NULL || BN_copy(y, in_b) == NULL)
+        goto err;
+    x->neg = 0;
+    y->neg = 0;
+
+    /* euclid() expects its first argument to be the larger one */
+    if (BN_cmp(x, y) < 0) {
+        g = x;
+        x = y;
+        y = g;
+    }
+
+    g = euclid(x, y);
+    if (g != NULL && BN_copy(r, g) != NULL)
+        ok = 1;
+ err:
+    BN_CTX_end(ctx);
+    bn_check_top(r);
+    return (ok);
+}
+
+/*
+ * Binary GCD on two non-negative values with b <= a on entry.  Both
+ * operands are modified in place.  Returns whichever of the two BIGNUMs
+ * ends up holding the gcd, or NULL if a bignum operation fails.
+ */
+static BIGNUM *euclid(BIGNUM *a, BIGNUM *b)
+{
+    BIGNUM *swap;
+    int common_twos = 0;        /* factors of two shared by a and b */
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    /* loop invariant: 0 <= b <= a */
+    while (!BN_is_zero(b)) {
+        const int a_odd = BN_is_odd(a);
+        const int b_odd = BN_is_odd(b);
+
+        if (!a_odd && !b_odd) {
+            /* both even: halve both; the ordering is unaffected */
+            if (!BN_rshift1(a, a) || !BN_rshift1(b, b))
+                return (NULL);
+            common_twos++;
+            continue;
+        }
+
+        if (a_odd && b_odd) {
+            /* both odd: a - b is even, so replace a by (a - b) / 2 */
+            if (!BN_sub(a, a, b) || !BN_rshift1(a, a))
+                return (NULL);
+        } else if (a_odd) {
+            /* only b is even: drop its factor of two */
+            if (!BN_rshift1(b, b))
+                return (NULL);
+        } else {
+            /* only a is even: drop its factor of two */
+            if (!BN_rshift1(a, a))
+                return (NULL);
+        }
+
+        /* restore b <= a */
+        if (BN_cmp(a, b) < 0) {
+            swap = a;
+            a = b;
+            b = swap;
+        }
+    }
+
+    /* put back the powers of two removed from both operands */
+    if (common_twos && !BN_lshift(a, a, common_twos))
+        return (NULL);
+
+    bn_check_top(a);
+    return (a);
+}
+
+/* solves ax == 1 (mod n) */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+                                        const BIGNUM *a, const BIGNUM *n,
+                                        BN_CTX *ctx);
+
+BIGNUM *BN_mod_inverse(BIGNUM *in,
+                       const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
+{
+    BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+    BIGNUM *ret = NULL;
+    int sign;
+
+    if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0)
+        || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0)) {
+        return BN_mod_inverse_no_branch(in, a, n, ctx);
+    }
+
+    bn_check_top(a);
+    bn_check_top(n);
+
+    BN_CTX_start(ctx);
+    A = BN_CTX_get(ctx);
+    B = BN_CTX_get(ctx);
+    X = BN_CTX_get(ctx);
+    D = BN_CTX_get(ctx);
+    M = BN_CTX_get(ctx);
+    Y = BN_CTX_get(ctx);
+    T = BN_CTX_get(ctx);
+    if (T == NULL)
+        goto err;
+
+    if (in == NULL)
+        R = BN_new();
+    else
+        R = in;
+    if (R == NULL)
+        goto err;
+
+    BN_one(X);
+    BN_zero(Y);
+    if (BN_copy(B, a) == NULL)
+        goto err;
+    if (BN_copy(A, n) == NULL)
+        goto err;
+    A->neg = 0;
+    if (B->neg || (BN_ucmp(B, A) >= 0)) {
+        if (!BN_nnmod(B, B, A, ctx))
+            goto err;
+    }
+    sign = -1;
+    /*-
+     * From  B = a mod |n|,  A = |n|  it follows that
+     *
+     *      0 <= B < A,
+     *     -sign*X*a  ==  B   (mod |n|),
+     *      sign*Y*a  ==  A   (mod |n|).
+     */
+
+    if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048))) {
+        /*
+         * Binary inversion algorithm; requires odd modulus. This is faster
+         * than the general algorithm if the modulus is sufficiently small
+         * (about 400 .. 500 bits on 32-bit systems, but much more on 64-bit
+         * systems)
+         */
+        int shift;
+
+        while (!BN_is_zero(B)) {
+            /*-
+             *      0 < B < |n|,
+             *      0 < A <= |n|,
+             * (1) -sign*X*a  ==  B   (mod |n|),
+             * (2)  sign*Y*a  ==  A   (mod |n|)
+             */
+
+            /*
+             * Now divide B by the maximum possible power of two in the
+             * integers, and divide X by the same value mod |n|. When we're
+             * done, (1) still holds.
+             */
+            shift = 0;
+            while (!BN_is_bit_set(B, shift)) { /* note that 0 < B */
+                shift++;
+
+                if (BN_is_odd(X)) {
+                    if (!BN_uadd(X, X, n))
+                        goto err;
+                }
+                /*
+                 * now X is even, so we can easily divide it by two
+                 */
+                if (!BN_rshift1(X, X))
+                    goto err;
+            }
+            if (shift > 0) {
+                if (!BN_rshift(B, B, shift))
+                    goto err;
+            }
+
+            /*
+             * Same for A and Y.  Afterwards, (2) still holds.
+             */
+            shift = 0;
+            while (!BN_is_bit_set(A, shift)) { /* note that 0 < A */
+                shift++;
+
+                if (BN_is_odd(Y)) {
+                    if (!BN_uadd(Y, Y, n))
+                        goto err;
+                }
+                /* now Y is even */
+                if (!BN_rshift1(Y, Y))
+                    goto err;
+            }
+            if (shift > 0) {
+                if (!BN_rshift(A, A, shift))
+                    goto err;
+            }
+
+            /*-
+             * We still have (1) and (2).
+             * Both  A  and  B  are odd.
+             * The following computations ensure that
+             *
+             *     0 <= B < |n|,
+             *      0 < A < |n|,
+             * (1) -sign*X*a  ==  B   (mod |n|),
+             * (2)  sign*Y*a  ==  A   (mod |n|),
+             *
+             * and that either  A  or  B  is even in the next iteration.
+             */
+            if (BN_ucmp(B, A) >= 0) {
+                /* -sign*(X + Y)*a == B - A  (mod |n|) */
+                if (!BN_uadd(X, X, Y))
+                    goto err;
+                /*
+                 * NB: we could use BN_mod_add_quick(X, X, Y, n), but that
+                 * actually makes the algorithm slower
+                 */
+                if (!BN_usub(B, B, A))
+                    goto err;
+            } else {
+                /*  sign*(X + Y)*a == A - B  (mod |n|) */
+                if (!BN_uadd(Y, Y, X))
+                    goto err;
+                /*
+                 * as above, BN_mod_add_quick(Y, Y, X, n) would slow things
+                 * down
+                 */
+                if (!BN_usub(A, A, B))
+                    goto err;
+            }
+        }
+    } else {
+        /* general inversion algorithm */
+
+        while (!BN_is_zero(B)) {
+            BIGNUM *tmp;
+
+            /*-
+             *      0 < B < A,
+             * (*) -sign*X*a  ==  B   (mod |n|),
+             *      sign*Y*a  ==  A   (mod |n|)
+             */
+
+            /* (D, M) := (A/B, A%B) ... */
+            if (BN_num_bits(A) == BN_num_bits(B)) {
+                if (!BN_one(D))
+                    goto err;
+                if (!BN_sub(M, A, B))
+                    goto err;
+            } else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
+                /* A/B is 1, 2, or 3 */
+                if (!BN_lshift1(T, B))
+                    goto err;
+                if (BN_ucmp(A, T) < 0) {
+                    /* A < 2*B, so D=1 */
+                    if (!BN_one(D))
+                        goto err;
+                    if (!BN_sub(M, A, B))
+                        goto err;
+                } else {
+                    /* A >= 2*B, so D=2 or D=3 */
+                    if (!BN_sub(M, A, T))
+                        goto err;
+                    if (!BN_add(D, T, B))
+                        goto err; /* use D (:= 3*B) as temp */
+                    if (BN_ucmp(A, D) < 0) {
+                        /* A < 3*B, so D=2 */
+                        if (!BN_set_word(D, 2))
+                            goto err;
+                        /*
+                         * M (= A - 2*B) already has the correct value
+                         */
+                    } else {
+                        /* only D=3 remains */
+                        if (!BN_set_word(D, 3))
+                            goto err;
+                        /*
+                         * currently M = A - 2*B, but we need M = A - 3*B
+                         */
+                        if (!BN_sub(M, M, B))
+                            goto err;
+                    }
+                }
+            } else {
+                if (!BN_div(D, M, A, B, ctx))
+                    goto err;
+            }
+
+            /*-
+             * Now
+             *      A = D*B + M;
+             * thus we have
+             * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+             */
+
+            tmp = A;            /* keep the BIGNUM object, the value does not
+                                 * matter */
+
+            /* (A, B) := (B, A mod B) ... */
+            A = B;
+            B = M;
+            /* ... so we have  0 <= B < A  again */
+
+            /*-
+             * Since the former  M  is now  B  and the former  B  is now  A,
+             * (**) translates into
+             *       sign*Y*a  ==  D*A + B    (mod |n|),
+             * i.e.
+             *       sign*Y*a - D*A  ==  B    (mod |n|).
+             * Similarly, (*) translates into
+             *      -sign*X*a  ==  A          (mod |n|).
+             *
+             * Thus,
+             *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+             * i.e.
+             *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+             *
+             * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+             *      -sign*X*a  ==  B   (mod |n|),
+             *       sign*Y*a  ==  A   (mod |n|).
+             * Note that  X  and  Y  stay non-negative all the time.
+             */
+
+            /*
+             * most of the time D is very small, so we can optimize tmp :=
+             * D*X+Y
+             */
+            if (BN_is_one(D)) {
+                if (!BN_add(tmp, X, Y))
+                    goto err;
+            } else {
+                if (BN_is_word(D, 2)) {
+                    if (!BN_lshift1(tmp, X))
+                        goto err;
+                } else if (BN_is_word(D, 4)) {
+                    if (!BN_lshift(tmp, X, 2))
+                        goto err;
+                } else if (D->top == 1) {
+                    if (!BN_copy(tmp, X))
+                        goto err;
+                    if (!BN_mul_word(tmp, D->d[0]))
+                        goto err;
+                } else {
+                    if (!BN_mul(tmp, D, X, ctx))
+                        goto err;
+                }
+                if (!BN_add(tmp, tmp, Y))
+                    goto err;
+            }
+
+            M = Y;              /* keep the BIGNUM object, the value does not
+                                 * matter */
+            Y = X;
+            X = tmp;
+            sign = -sign;
+        }
+    }
+
+    /*-
+     * The while loop (Euclid's algorithm) ends when
+     *      A == gcd(a,n);
+     * we have
+     *       sign*Y*a  ==  A  (mod |n|),
+     * where  Y  is non-negative.
+     */
+
+    if (sign < 0) {
+        if (!BN_sub(Y, n, Y))
+            goto err;
+    }
+    /* Now  Y*a  ==  A  (mod |n|).  */
+
+    if (BN_is_one(A)) {
+        /* Y*a == 1  (mod |n|) */
+        if (!Y->neg && BN_ucmp(Y, n) < 0) {
+            if (!BN_copy(R, Y))
+                goto err;
+        } else {
+            if (!BN_nnmod(R, Y, n, ctx))
+                goto err;
+        }
+    } else {
+        BNerr(BN_F_BN_MOD_INVERSE, BN_R_NO_INVERSE);
+        goto err;
+    }
+    ret = R;
+ err:
+    if ((ret == NULL) && (in == NULL))
+        BN_free(R);
+    BN_CTX_end(ctx);
+    bn_check_top(ret);
+    return (ret);
+}
+
+/*
+ * BN_mod_inverse_no_branch is a special version of BN_mod_inverse. It does
+ * not contain branches that may leak sensitive information.
+ *
+ * The inverse is computed with the extended Euclidean algorithm; every
+ * division goes through BN_div on an operand view that carries
+ * BN_FLG_CONSTTIME.
+ *
+ *   in   optional result BIGNUM; when NULL, a new BIGNUM is allocated
+ *   a    value to invert
+ *   n    modulus; the working copy has its sign cleared, so the arithmetic
+ *        is carried out modulo |n|
+ *   ctx  BN_CTX that supplies the temporaries
+ *
+ * Returns the BIGNUM holding the result ('in' or the newly allocated one),
+ * or NULL on failure.  BN_R_NO_INVERSE is raised when gcd(a, n) is not 1.
+ * A BIGNUM allocated here is freed again before NULL is returned.
+ */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+                                        const BIGNUM *a, const BIGNUM *n,
+                                        BN_CTX *ctx)
+{
+    BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+    BIGNUM local_A, local_B;
+    BIGNUM *pA, *pB;
+    BIGNUM *ret = NULL;
+    int sign;
+
+    bn_check_top(a);
+    bn_check_top(n);
+
+    BN_CTX_start(ctx);
+    A = BN_CTX_get(ctx);
+    B = BN_CTX_get(ctx);
+    X = BN_CTX_get(ctx);
+    D = BN_CTX_get(ctx);
+    M = BN_CTX_get(ctx);
+    Y = BN_CTX_get(ctx);
+    T = BN_CTX_get(ctx);
+    /* only the last BN_CTX_get result is tested */
+    if (T == NULL)
+        goto err;
+
+    if (in == NULL)
+        R = BN_new();
+    else
+        R = in;
+    if (R == NULL)
+        goto err;
+
+    /*
+     * X := 1 is checked like the BN_one(D) calls in BN_mod_inverse: if it
+     * failed and were ignored, X would stay 0 and the loop below would
+     * produce a wrong result instead of reporting an error.
+     */
+    if (!BN_one(X))
+        goto err;
+    BN_zero(Y);
+    if (BN_copy(B, a) == NULL)
+        goto err;
+    if (BN_copy(A, n) == NULL)
+        goto err;
+    A->neg = 0;
+
+    if (B->neg || (BN_ucmp(B, A) >= 0)) {
+        /*
+         * Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+         * BN_div_no_branch will be called eventually.
+         */
+        pB = &local_B;
+        /* clear the flags first: BN_with_flags reads the destination flags */
+        local_B.flags = 0;
+        BN_with_flags(pB, B, BN_FLG_CONSTTIME);
+        if (!BN_nnmod(B, pB, A, ctx))
+            goto err;
+    }
+    sign = -1;
+    /*-
+     * From  B = a mod |n|,  A = |n|  it follows that
+     *
+     *      0 <= B < A,
+     *     -sign*X*a  ==  B   (mod |n|),
+     *      sign*Y*a  ==  A   (mod |n|).
+     */
+
+    while (!BN_is_zero(B)) {
+        BIGNUM *tmp;
+
+        /*-
+         *      0 < B < A,
+         * (*) -sign*X*a  ==  B   (mod |n|),
+         *      sign*Y*a  ==  A   (mod |n|)
+         */
+
+        /*
+         * Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+         * BN_div_no_branch will be called eventually.
+         */
+        pA = &local_A;
+        local_A.flags = 0;
+        BN_with_flags(pA, A, BN_FLG_CONSTTIME);
+
+        /* (D, M) := (A/B, A%B) ... */
+        if (!BN_div(D, M, pA, B, ctx))
+            goto err;
+
+        /*-
+         * Now
+         *      A = D*B + M;
+         * thus we have
+         * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+         */
+
+        tmp = A;                /* keep the BIGNUM object, the value does not
+                                 * matter */
+
+        /* (A, B) := (B, A mod B) ... */
+        A = B;
+        B = M;
+        /* ... so we have  0 <= B < A  again */
+
+        /*-
+         * Since the former  M  is now  B  and the former  B  is now  A,
+         * (**) translates into
+         *       sign*Y*a  ==  D*A + B    (mod |n|),
+         * i.e.
+         *       sign*Y*a - D*A  ==  B    (mod |n|).
+         * Similarly, (*) translates into
+         *      -sign*X*a  ==  A          (mod |n|).
+         *
+         * Thus,
+         *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+         * i.e.
+         *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+         *
+         * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+         *      -sign*X*a  ==  B   (mod |n|),
+         *       sign*Y*a  ==  A   (mod |n|).
+         * Note that  X  and  Y  stay non-negative all the time.
+         */
+
+        if (!BN_mul(tmp, D, X, ctx))
+            goto err;
+        if (!BN_add(tmp, tmp, Y))
+            goto err;
+
+        M = Y;                  /* keep the BIGNUM object, the value does not
+                                 * matter */
+        Y = X;
+        X = tmp;
+        sign = -sign;
+    }
+
+    /*-
+     * The while loop (Euclid's algorithm) ends when
+     *      A == gcd(a,n);
+     * we have
+     *       sign*Y*a  ==  A  (mod |n|),
+     * where  Y  is non-negative.
+     */
+
+    if (sign < 0) {
+        if (!BN_sub(Y, n, Y))
+            goto err;
+    }
+    /* Now  Y*a  ==  A  (mod |n|).  */
+
+    if (BN_is_one(A)) {
+        /* Y*a == 1  (mod |n|) */
+        if (!Y->neg && BN_ucmp(Y, n) < 0) {
+            /* Y is already non-negative and below n in magnitude */
+            if (!BN_copy(R, Y))
+                goto err;
+        } else {
+            if (!BN_nnmod(R, Y, n, ctx))
+                goto err;
+        }
+    } else {
+        BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH, BN_R_NO_INVERSE);
+        goto err;
+    }
+    ret = R;
+ err:
+    /* release R only when it was allocated here and is not being returned */
+    if ((ret == NULL) && (in == NULL))
+        BN_free(R);
+    BN_CTX_end(ctx);
+    bn_check_top(ret);
+    return (ret);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_gcd.o
+++ b/openssl-1.0.2f/crypto/bn/bn_gcd.o
--- a/openssl-1.0.2f/crypto/bn/bn_gf2m.c
+++ b/openssl-1.0.2f/crypto/bn/bn_gf2m.c
--- a/openssl-1.0.2f/crypto/bn/bn_gf2m.o
+++ b/openssl-1.0.2f/crypto/bn/bn_gf2m.o
--- a/openssl-1.0.2f/crypto/bn/bn_kron.c
+++ b/openssl-1.0.2f/crypto/bn/bn_kron.c
@@ -0,0 +1,186 @@
+/* crypto/bn/bn_kron.c */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Least significant word of n; evaluates to 0 when n holds no words
+ * (n->top == 0), so d[0] is not read in that case.
+ */
+#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
+
+/*
+ * Kronecker symbol of a and b, computed on private copies of both inputs.
+ *
+ * Returns -2 for errors because both -1 and 0 are valid results.
+ */
+int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
+{
+    /*-
+     * Lookup table for odd BIGNUMs n:
+     *     tab[BN_lsw(n) & 7]
+     * equals $(-1)^{(n^2-1)/8}$ (using TeX notation), whatever the sign of
+     * n is.  The even-indexed entries are never relevant.
+     */
+    static const int tab[8] = { 0, 1, 0, -1, 0, -1, 0, 1 };
+    BIGNUM *A, *B, *swap;
+    int twos;                   /* number of trailing zero bits stripped */
+    int ret = -2;               /* avoid 'uninitialized' warning */
+    int err = 0;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    BN_CTX_start(ctx);
+    A = BN_CTX_get(ctx);
+    B = BN_CTX_get(ctx);
+    if (B == NULL)
+        goto end;
+
+    if (BN_copy(A, a) == NULL || BN_copy(B, b) == NULL) {
+        err = 1;
+        goto end;
+    }
+
+    /*
+     * Kronecker symbol, implemented according to Henri Cohen,
+     * "A Course in Computational Algebraic Number Theory"
+     * (algorithm 1.4.10).
+     */
+
+    /* Cohen's step 1: B == 0 */
+    if (BN_is_zero(B)) {
+        ret = BN_abs_is_word(A, 1);
+        goto end;
+    }
+
+    /* Cohen's step 2: A and B both even */
+    if (!(BN_is_odd(A) || BN_is_odd(B))) {
+        ret = 0;
+        goto end;
+    }
+
+    /* B is non-zero here, so a set bit is found and the scan terminates */
+    for (twos = 0; !BN_is_bit_set(B, twos); twos++)
+        continue;
+    if (!BN_rshift(B, B, twos)) {
+        err = 1;
+        goto end;
+    }
+
+    /*
+     * An odd count means B was even, thus A must be odd, and 'ret' starts
+     * as $(-1)^{(A^2-1)/8}$; an even count starts it as 1.
+     */
+    ret = (twos & 1) ? tab[BN_lsw(A) & 7] : 1;
+
+    if (B->neg) {
+        if (A->neg)
+            ret = -ret;
+        B->neg = 0;
+    }
+
+    /*
+     * B is now positive and odd; the rest computes the Jacobi symbol (A/B)
+     * and multiplies it into 'ret'.
+     */
+    for (;;) {
+        BN_ULONG a_low;
+
+        /* Cohen's step 3 (B is positive and odd): */
+        if (BN_is_zero(A)) {
+            if (!BN_is_one(B))
+                ret = 0;
+            goto end;
+        }
+
+        /* A is non-zero: strip and count its trailing zero bits */
+        for (twos = 0; !BN_is_bit_set(A, twos); twos++)
+            continue;
+        if (!BN_rshift(A, A, twos)) {
+            err = 1;
+            goto end;
+        }
+        if (twos & 1) {
+            /* odd count: multiply 'ret' by  $(-1)^{(B^2-1)/8}$ */
+            ret *= tab[BN_lsw(B) & 7];
+        }
+
+        /* Cohen's step 4: multiply 'ret' by  $(-1)^{(A-1)(B-1)/4}$ */
+        a_low = BN_lsw(A);
+        if (A->neg)
+            a_low = ~a_low;
+        if (a_low & BN_lsw(B) & 2)
+            ret = -ret;
+
+        /* (A, B) := (B mod |A|, |A|) */
+        if (!BN_nnmod(B, B, A, ctx)) {
+            err = 1;
+            goto end;
+        }
+        swap = A;
+        A = B;
+        B = swap;
+        B->neg = 0;
+    }
+ end:
+    BN_CTX_end(ctx);
+    return err ? -2 : ret;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_kron.o
+++ b/openssl-1.0.2f/crypto/bn/bn_kron.o
--- a/openssl-1.0.2f/crypto/bn/bn_lcl.h
+++ b/openssl-1.0.2f/crypto/bn/bn_lcl.h
@@ -0,0 +1,537 @@
+/* crypto/bn/bn_lcl.h */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#ifndef HEADER_BN_LCL_H
+# define HEADER_BN_LCL_H
+
+# include <openssl/bn.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/*-
+ * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
+ *
+ *
+ * For window size 'w' (w >= 2) and a random 'b' bits exponent,
+ * the number of multiplications is a constant plus on average
+ *
+ *    2^(w-1) + (b-w)/(w+1);
+ *
+ * here  2^(w-1)  is for precomputing the table (we actually need
+ * entries only for windows that have the lowest bit set), and
+ * (b-w)/(w+1)  is an approximation for the expected number of
+ * w-bit windows, not counting the first one.
+ *
+ * Thus we should use
+ *
+ *    w >= 6  if        b > 671
+ *     w = 5  if  671 > b > 239
+ *     w = 4  if  239 > b >  79
+ *     w = 3  if   79 > b >  23
+ *    w <= 2  if   23 > b
+ *
+ * (with draws in between).  Very small exponents are often selected
+ * with low Hamming weight, so we use  w = 1  for b <= 23.
+ */
+/*
+ * The "# if 1" below always selects the table derived above; the "# else"
+ * branch is never compiled and only keeps the old table for reference.
+ */
+# if 1
+#  define BN_window_bits_for_exponent_size(b) \
+                ((b) > 671 ? 6 : \
+                 (b) > 239 ? 5 : \
+                 (b) >  79 ? 4 : \
+                 (b) >  23 ? 3 : 1)
+# else
+/*
+ * Old SSLeay/OpenSSL table. Maximum window size was 5, so this table differs
+ * for b==1024; but it coincides for other interesting values (b==160,
+ * b==512).
+ */
+#  define BN_window_bits_for_exponent_size(b) \
+                ((b) > 255 ? 5 : \
+                 (b) > 127 ? 4 : \
+                 (b) >  17 ? 3 : 1)
+# endif
+
+/*
+ * BN_mod_exp_mont_consttime is based on the assumption that the L1 data cache
+ * line width of the target processor is at least the following value.
+ *
+ * MOD_EXP_CTIME_MIN_CACHE_LINE_MASK is that width minus one, i.e. a mask of
+ * the low bits of an offset within one line (the width is a power of two).
+ */
+# define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH      ( 64 )
+# define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK       (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/*
+ * Window sizes optimized for fixed window size modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime). To achieve the security goals of
+ * BN_mode_exp_mont_consttime, the maximum size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). Window size thresholds are
+ * defined for cache line sizes of 32 and 64, cache line sizes where
+ * log_2(32)=5 and log_2(64)=6 respectively. A window size of 7 should only be
+ * used on processors that have a 128 byte or greater cache line size.
+ */
+# if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+                ((b) > 937 ? 6 : \
+                 (b) > 306 ? 5 : \
+                 (b) >  89 ? 4 : \
+                 (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE    (6)
+
+# elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+                ((b) > 306 ? 5 : \
+                 (b) >  89 ? 4 : \
+                 (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE    (5)
+
+# endif
+
+/*
+ * Size constants for the multiplication, squaring and Montgomery code.
+ * The two rows below give one number per macro, in the order the macros
+ * are defined.
+ * NOTE(review): presumably these are per-CPU tuned values and the trailing
+ * comments on each define are alternative settings -- confirm.
+ */
+/* Pentium pro 16,16,16,32,64 */
+/* Alpha       16,16,16,16,64 */
+# define BN_MULL_SIZE_NORMAL                     (16)/* 32 */
+# define BN_MUL_RECURSIVE_SIZE_NORMAL            (16)/* 32 less than */
+# define BN_SQR_RECURSIVE_SIZE_NORMAL            (16)/* 32 */
+# define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL        (32)/* 32 */
+# define BN_MONT_CTX_SET_SIZE_WORD               (64)/* 32 */
+
+/*
+ * 2011-02-22 SMS. In various places, a size_t variable or a type cast to
+ * size_t was used to perform integer-only operations on pointers.  This
+ * failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
+ * is still only 32 bits.  What's needed in these cases is an integer type
+ * with the same size as a pointer, which size_t is not certain to be. The
+ * only fix here is VMS-specific.
+ *
+ * On VMS, PTR_SIZE_INT is long long when __INITIAL_POINTER_SIZE is 64 and
+ * int otherwise.  Everywhere else it becomes size_t, unless PTR_SIZE_INT
+ * has already been defined before this point.
+ */
+# if defined(OPENSSL_SYS_VMS)
+#  if __INITIAL_POINTER_SIZE == 64
+#   define PTR_SIZE_INT long long
+#  else                         /* __INITIAL_POINTER_SIZE == 64 */
+#   define PTR_SIZE_INT int
+#  endif                        /* __INITIAL_POINTER_SIZE == 64 [else] */
+# elif !defined(PTR_SIZE_INT)   /* defined(OPENSSL_SYS_VMS) */
+#  define PTR_SIZE_INT size_t
+# endif                         /* defined(OPENSSL_SYS_VMS) [else] */
+
+# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
+/*
+ * BN_UMULT_HIGH section.
+ *
+ * No, I'm not trying to overwhelm you when stating that the
+ * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
+ * you to be impressed when I say that if the compiler doesn't
+ * support 2*N integer type, then you have to replace every N*N
+ * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
+ * and additions which unavoidably results in severe performance
+ * penalties. Of course provided that the hardware is capable of
+ * producing 2*N result... That's when you normally start
+ * considering assembler implementation. However! It should be
+ * pointed out that some CPUs (most notably Alpha, PowerPC and
+ * upcoming IA-64 family:-) provide *separate* instruction
+ * calculating the upper half of the product placing the result
+ * into a general purpose register. Now *if* the compiler supports
+ * inline assembler, then it's not impossible to implement the
+ * "bignum" routines (and have the compiler optimize 'em)
+ * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
+ * macro is about:-)
+ *
+ *                                      <appro@fy.chalmers.se>
+ */
+/*
+ * Each branch below defines BN_UMULT_HIGH(a,b), and on some targets also
+ * BN_UMULT_LOHI(low,high,a,b), for one CPU/compiler combination.  When no
+ * branch matches, neither macro is defined; the mul_add/mul/sqr
+ * definitions further down test for them with defined().
+ */
+#  if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+/* Alpha: "umulh" instruction, through DEC C asm() or GCC inline asm. */
+#   if defined(__DECC)
+#    include <c_asm.h>
+#    define BN_UMULT_HIGH(a,b)   (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
+#   elif defined(__GNUC__) && __GNUC__>=2
+#    define BN_UMULT_HIGH(a,b)   ({      \
+        register BN_ULONG ret;          \
+        asm ("umulh     %1,%2,%0"       \
+             : "=r"(ret)                \
+             : "r"(a), "r"(b));         \
+        ret;                    })
+#   endif                       /* compiler */
+#  elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
+/* 64-bit PowerPC with GCC: "mulhdu" instruction. */
+#   if defined(__GNUC__) && __GNUC__>=2
+#    define BN_UMULT_HIGH(a,b)   ({      \
+        register BN_ULONG ret;          \
+        asm ("mulhdu    %0,%1,%2"       \
+             : "=r"(ret)                \
+             : "r"(a), "r"(b));         \
+        ret;                    })
+#   endif                       /* compiler */
+#  elif (defined(__x86_64) || defined(__x86_64__)) && \
+       (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+/*
+ * x86-64 with GCC: "mulq"; the "=a" output receives the low half of the
+ * product and the "=d" output the high half.
+ */
+#   if defined(__GNUC__) && __GNUC__>=2
+#    define BN_UMULT_HIGH(a,b)   ({      \
+        register BN_ULONG ret,discard;  \
+        asm ("mulq      %3"             \
+             : "=a"(discard),"=d"(ret)  \
+             : "a"(a), "g"(b)           \
+             : "cc");                   \
+        ret;                    })
+#    define BN_UMULT_LOHI(low,high,a,b)  \
+        asm ("mulq      %3"             \
+                : "=a"(low),"=d"(high)  \
+                : "a"(a),"g"(b)         \
+                : "cc");
+#   endif
+#  elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+/* x64 with MSVC (_MSC_VER >= 1400): __umulh and _umul128 intrinsics. */
+#   if defined(_MSC_VER) && _MSC_VER>=1400
+unsigned __int64 __umulh(unsigned __int64 a, unsigned __int64 b);
+unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
+                          unsigned __int64 *h);
+#    pragma intrinsic(__umulh,_umul128)
+#    define BN_UMULT_HIGH(a,b)           __umulh((a),(b))
+#    define BN_UMULT_LOHI(low,high,a,b)  ((low)=_umul128((a),(b),&(high)))
+#   endif
+#  elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+/*
+ * 64-bit MIPS with GCC: from GCC 4.4 on, plain __uint128_t arithmetic;
+ * older compilers use "dmultu" with the "h"/"l" register constraints.
+ */
+#   if defined(__GNUC__) && __GNUC__>=2
+#    if __GNUC__>4 || (__GNUC__>=4 && __GNUC_MINOR__>=4)
+                                     /* "h" constraint is no more since 4.4 */
+#     define BN_UMULT_HIGH(a,b)          (((__uint128_t)(a)*(b))>>64)
+#     define BN_UMULT_LOHI(low,high,a,b) ({     \
+        __uint128_t ret=(__uint128_t)(a)*(b);   \
+        (high)=ret>>64; (low)=ret;       })
+#    else
+#     define BN_UMULT_HIGH(a,b) ({      \
+        register BN_ULONG ret;          \
+        asm ("dmultu    %1,%2"          \
+             : "=h"(ret)                \
+             : "r"(a), "r"(b) : "l");   \
+        ret;                    })
+#     define BN_UMULT_LOHI(low,high,a,b)\
+        asm ("dmultu    %2,%3"          \
+             : "=l"(low),"=h"(high)     \
+             : "r"(a), "r"(b));
+#    endif
+#   endif
+#  elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
+/* AArch64 with GCC: "umulh" instruction. */
+#   if defined(__GNUC__) && __GNUC__>=2
+#    define BN_UMULT_HIGH(a,b)   ({      \
+        register BN_ULONG ret;          \
+        asm ("umulh     %0,%1,%2"       \
+             : "=r"(ret)                \
+             : "r"(a), "r"(b));         \
+        ret;                    })
+#   endif
+#  endif                        /* cpu */
+# endif                         /* OPENSSL_NO_ASM */
+
+/*************************************************************
+ * Using the long long type
+ *
+ * Lw(t) is the low word of t: t converted to BN_ULONG and masked with
+ * BN_MASK2.  Hw(t) is the next word up: t shifted right by BN_BITS2 bits,
+ * then converted and masked the same way.
+ */
+# define Lw(t)    (((BN_ULONG)(t))&BN_MASK2)
+# define Hw(t)    (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
+
+/*
+ * bn_clear_top2max(a): in BN_DEBUG_RAND builds, zero the words of a->d from
+ * index a->top up to a->dmax - 1, i.e. the ones beyond 'top'; in all other
+ * builds it expands to nothing.
+ *
+ * The cursor starts at a->d + a->top and is post-incremented, so no pointer
+ * to d[-1] is formed when a->top is 0.
+ */
+# ifdef BN_DEBUG_RAND
+#  define bn_clear_top2max(a) \
+        { \
+        int      ind = (a)->dmax - (a)->top; \
+        BN_ULONG *ftl = (a)->d + (a)->top; \
+        for (; ind != 0; ind--) \
+                *(ftl++) = 0x0; \
+        }
+# else
+#  define bn_clear_top2max(a)
+# endif
+
+# ifdef BN_LLONG
+/*
+ * mul_add(r,a,w,c): (c:r) = w * a + r + c, evaluated in a double-width
+ * BN_ULLONG temporary.  On exit r holds the low word (Lw) and c the high
+ * word (Hw) of the result.  r and c must be lvalues.
+ * Every argument is parenthesised so that a compound expression passed as
+ * 'w' is cast and multiplied as a whole.
+ */
+#  define mul_add(r,a,w,c) { \
+        BN_ULLONG t; \
+        t=(BN_ULLONG)(w) * (a) + (r) + (c); \
+        (r)= Lw(t); \
+        (c)= Hw(t); \
+        }
+
+/*
+ * mul(r,a,w,c): (c:r) = w * a + c, evaluated in a double-width BN_ULLONG
+ * temporary.  On exit r holds the low word (Lw) and c the high word (Hw).
+ * r and c must be lvalues.
+ * Every argument is parenthesised so that a compound expression passed as
+ * 'w' is cast and multiplied as a whole.
+ */
+#  define mul(r,a,w,c) { \
+        BN_ULLONG t; \
+        t=(BN_ULLONG)(w) * (a) + (c); \
+        (r)= Lw(t); \
+        (c)= Hw(t); \
+        }
+
+/*
+ * sqr(r0,r1,a): (r1:r0) = a * a, evaluated in a double-width BN_ULLONG
+ * temporary; r0 receives the low word and r1 the high word.
+ */
+#  define sqr(r0,r1,a) { \
+        BN_ULLONG t; \
+        t=(BN_ULLONG)(a)*(a); \
+        (r0)=Lw(t); \
+        (r1)=Hw(t); \
+        }
+
+# elif defined(BN_UMULT_LOHI)
+/*
+ * mul_add(r,a,w,c): (c:r) = w * a + r + c, built on BN_UMULT_LOHI.
+ * After each addition into 'ret' an unsigned "sum < addend" comparison
+ * recovers the carry, which is accumulated into c together with 'high'.
+ * 'a' is evaluated once (copied into tmp); r and c must be lvalues.
+ */
+#  define mul_add(r,a,w,c) {              \
+        BN_ULONG high,low,ret,tmp=(a);  \
+        ret =  (r);                     \
+        BN_UMULT_LOHI(low,high,w,tmp);  \
+        ret += (c);                     \
+        (c) =  (ret<(c))?1:0;           \
+        (c) += high;                    \
+        ret += low;                     \
+        (c) += (ret<low)?1:0;           \
+        (r) =  ret;                     \
+        }
+
+/*
+ * mul(r,a,w,c): (c:r) = w * a + c, built on BN_UMULT_LOHI.
+ * The carry out of "low + c" is detected with the unsigned comparison
+ * (ret < low) and added to the high word, which becomes the new c.
+ */
+#  define mul(r,a,w,c)    {               \
+        BN_ULONG high,low,ret,ta=(a);   \
+        BN_UMULT_LOHI(low,high,w,ta);   \
+        ret =  low + (c);               \
+        (c) =  high;                    \
+        (c) += (ret<low)?1:0;           \
+        (r) =  ret;                     \
+        }
+
+/*
+ * sqr(r0,r1,a): (r1:r0) = a * a via BN_UMULT_LOHI; 'a' is evaluated once
+ * (copied into tmp) and r0/r1 are passed straight through as the low/high
+ * outputs.
+ */
+#  define sqr(r0,r1,a)    {               \
+        BN_ULONG tmp=(a);               \
+        BN_UMULT_LOHI(r0,r1,tmp,tmp);   \
+        }
+
+# elif defined(BN_UMULT_HIGH)
+/*
+ * mul_add(r,a,w,c): (c:r) = w * a + r + c, for targets that only provide
+ * BN_UMULT_HIGH.  The high word comes from BN_UMULT_HIGH and the low word
+ * from an ordinary (truncating) multiplication; carries are recovered with
+ * unsigned "sum < addend" comparisons and accumulated into c.
+ */
+#  define mul_add(r,a,w,c) {              \
+        BN_ULONG high,low,ret,tmp=(a);  \
+        ret =  (r);                     \
+        high=  BN_UMULT_HIGH(w,tmp);    \
+        ret += (c);                     \
+        low =  (w) * tmp;               \
+        (c) =  (ret<(c))?1:0;           \
+        (c) += high;                    \
+        ret += low;                     \
+        (c) += (ret<low)?1:0;           \
+        (r) =  ret;                     \
+        }
+
+/*
+ * mul(r,a,w,c): (c:r) = w * a + c, for targets that only provide
+ * BN_UMULT_HIGH.  Low word from a truncating multiplication, high word from
+ * BN_UMULT_HIGH; the carry out of "low + c" is (ret < low).
+ */
+#  define mul(r,a,w,c)    {               \
+        BN_ULONG high,low,ret,ta=(a);   \
+        low =  (w) * ta;                \
+        high=  BN_UMULT_HIGH(w,ta);     \
+        ret =  low + (c);               \
+        (c) =  high;                    \
+        (c) += (ret<low)?1:0;           \
+        (r) =  ret;                     \
+        }
+
+/*
+ * sqr(r0,r1,a): (r1:r0) = a * a; r0 is the truncating product and r1 the
+ * BN_UMULT_HIGH of the same operands.  'a' is evaluated once.
+ */
+#  define sqr(r0,r1,a)    {               \
+        BN_ULONG tmp=(a);               \
+        (r0) = tmp * tmp;               \
+        (r1) = BN_UMULT_HIGH(tmp,tmp);  \
+        }
+
+# else
+/*************************************************************
+ * No long long type
+ */
+
+/*
+ * Half-word helpers for the portable multiply below.
+ *   LBITS(a)   - a masked with BN_MASK2l (low half of a word)
+ *   HBITS(a)   - a shifted right by BN_BITS4, then masked (high half)
+ *   L2HBITS(a) - a shifted left by BN_BITS4, masked to a full word
+ */
+#  define LBITS(a)        ((a)&BN_MASK2l)
+#  define HBITS(a)        (((a)>>BN_BITS4)&BN_MASK2l)
+#  define L2HBITS(a)      (((a)<<BN_BITS4)&BN_MASK2)
+
+/*
+ * The same three operations one size up: masks use BN_MASKl and shifts use
+ * BN_BITS2; LL2HBITS widens through BN_ULLONG before shifting.
+ */
+#  define LLBITS(a)       ((a)&BN_MASKl)
+#  define LHBITS(a)       (((a)>>BN_BITS2)&BN_MASKl)
+#  define LL2HBITS(a)     ((BN_ULLONG)((a)&BN_MASKl)<<BN_BITS2)
+
+/*
+ * mul64(l,h,bl,bh): schoolbook multiplication from half-word pieces.
+ * On entry l/h are the low/high halves of one operand and bl/bh the halves
+ * of the other; on exit l holds the low word and h the high word of the
+ * product.  The two cross products are summed in m; a carry out of that sum
+ * is worth 1 << BN_BITS4 in the high word, and a carry out of the final low
+ * word addition increments the high word.  l and h must be lvalues.
+ */
+#  define mul64(l,h,bl,bh) \
+        { \
+        BN_ULONG m,m1,lt,ht; \
+ \
+        lt=l; \
+        ht=h; \
+        m =(bh)*(lt); \
+        lt=(bl)*(lt); \
+        m1=(bl)*(ht); \
+        ht =(bh)*(ht); \
+        m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
+        ht+=HBITS(m); \
+        m1=L2HBITS(m); \
+        lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
+        (l)=lt; \
+        (h)=ht; \
+        }
+
+/*
+ * sqr64(lo,ho,in): (ho:lo) = in * in from half-word pieces.
+ * 'in' is split into halves l and h; the single cross product m = l*h is
+ * folded in with shifts that are one bit larger/smaller than a plain
+ * half-word shift (BN_BITS4+1 / BN_BITS4-1), which accounts for the cross
+ * term appearing twice in a square.  BN_MASK2h1 is defined elsewhere;
+ * presumably it selects the bits of m that land in the high word.
+ */
+#  define sqr64(lo,ho,in) \
+        { \
+        BN_ULONG l,h,m; \
+ \
+        h=(in); \
+        l=LBITS(h); \
+        h=HBITS(h); \
+        m =(l)*(h); \
+        l*=l; \
+        h*=h; \
+        h+=(m&BN_MASK2h1)>>(BN_BITS4-1); \
+        m =(m&BN_MASK2l)<<(BN_BITS4+1); \
+        l=(l+m)&BN_MASK2; if (l < m) h++; \
+        (lo)=l; \
+        (ho)=h; \
+        }
+
+/*
+ * mul_add(r,a,bl,bh,c): portable variant.  Note the different arity from the
+ * other mul_add definitions: the multiplier is passed as two halves bl/bh
+ * (presumably pre-split by the caller with LBITS/HBITS).
+ * Splits a into halves, multiplies with mul64, then adds c and r to the low
+ * word, bumping the high word on each carry.  On exit r is the low word and
+ * c the high word.  c is reused as scratch for the old value of r.
+ */
+#  define mul_add(r,a,bl,bh,c) { \
+        BN_ULONG l,h; \
+ \
+        h= (a); \
+        l=LBITS(h); \
+        h=HBITS(h); \
+        mul64(l,h,(bl),(bh)); \
+ \
+        /* non-multiply part */ \
+        l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
+        (c)=(r); \
+        l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
+        (c)=h&BN_MASK2; \
+        (r)=l; \
+        }
+
+/*
+ * mul(r,a,bl,bh,c): portable variant taking the multiplier as two halves
+ * bl/bh.  Splits a into halves, multiplies with mul64, adds the incoming
+ * carry c to the low word (bumping the high word on overflow), then stores
+ * the low word in r and the high word in c.
+ */
+#  define mul(r,a,bl,bh,c) { \
+        BN_ULONG l,h; \
+ \
+        h= (a); \
+        l=LBITS(h); \
+        h=HBITS(h); \
+        mul64(l,h,(bl),(bh)); \
+ \
+        /* non-multiply part */ \
+        l+=(c); if ((l&BN_MASK2) < (c)) h++; \
+        (c)=h&BN_MASK2; \
+        (r)=l&BN_MASK2; \
+        }
+# endif                         /* !BN_LLONG */
+
+# if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#  undef bn_div_words
+# endif
+
+void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
+void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+                      int dna, int dnb, BN_ULONG *t);
+void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
+                           int n, int tna, int tnb, BN_ULONG *t);
+void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t);
+void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
+void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+                          BN_ULONG *t);
+void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
+                 BN_ULONG *t);
+BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                           int cl, int dl);
+BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                           int cl, int dl);
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
--- a/openssl-1.0.2f/crypto/bn/bn_lib.c
+++ b/openssl-1.0.2f/crypto/bn/bn_lib.c
@@ -0,0 +1,916 @@
+/* crypto/bn/bn_lib.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#ifndef BN_DEBUG
+# undef NDEBUG                  /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/* Version banner: the text "Big Number" followed by OPENSSL_VERSION_PTEXT. */
+const char BN_version[] = "Big Number" OPENSSL_VERSION_PTEXT;
+
+/* This stuff appears to be completely unused, so is deprecated */
+#ifndef OPENSSL_NO_DEPRECATED
+/*-
+ * For a 32 bit machine
+ * 2 -   4 ==  128
+ * 3 -   8 ==  256
+ * 4 -  16 ==  512
+ * 5 -  32 == 1024
+ * 6 -  64 == 2048
+ * 7 - 128 == 4096
+ * 8 - 256 == 8192
+ */
+/*
+ * Tuning values written by BN_set_params(); the *_bits members are returned
+ * by BN_get_params().  NOTE(review): the initial values (bits 0, num 8) do
+ * not satisfy the "1 << bits" relation in the trailing comments; that
+ * relation only holds once BN_set_params() has stored a value.
+ */
+static int bn_limit_bits = 0;
+static int bn_limit_num = 8;    /* (1<<bn_limit_bits) */
+static int bn_limit_bits_low = 0;
+static int bn_limit_num_low = 8; /* (1<<bn_limit_bits_low) */
+static int bn_limit_bits_high = 0;
+static int bn_limit_num_high = 8; /* (1<<bn_limit_bits_high) */
+static int bn_limit_bits_mont = 0;
+static int bn_limit_num_mont = 8; /* (1<<bn_limit_bits_mont) */
+
+/*
+ * Helper for BN_set_params(): record one tuning value.
+ * A negative 'requested' leaves both outputs untouched.  Larger values are
+ * clamped to (number of bits in an int) - 1 before being stored in *bits_out,
+ * and *num_out receives 1 << bits.
+ */
+static void bn_store_limit(int requested, int *bits_out, int *num_out)
+{
+    const int max_shift = (int)(sizeof(int) * 8) - 1;
+
+    if (requested < 0)
+        return;
+    if (requested > max_shift)
+        requested = max_shift;
+    *bits_out = requested;
+    /*
+     * Shift in unsigned arithmetic: shifting a 1 into the sign bit of a
+     * signed int (requested == max_shift) is undefined behaviour.
+     */
+    *num_out = (int)(1U << requested);
+}
+
+/*
+ * Set the (deprecated) tuning parameters.  Each argument is a bit count; a
+ * negative argument leaves the corresponding setting unchanged.
+ *   mult -> bn_limit_bits / bn_limit_num
+ *   high -> bn_limit_bits_high / bn_limit_num_high
+ *   low  -> bn_limit_bits_low / bn_limit_num_low
+ *   mont -> bn_limit_bits_mont / bn_limit_num_mont
+ */
+void BN_set_params(int mult, int high, int low, int mont)
+{
+    bn_store_limit(mult, &bn_limit_bits, &bn_limit_num);
+    bn_store_limit(high, &bn_limit_bits_high, &bn_limit_num_high);
+    bn_store_limit(low, &bn_limit_bits_low, &bn_limit_num_low);
+    bn_store_limit(mont, &bn_limit_bits_mont, &bn_limit_num_mont);
+}
+
+/*
+ * Read back one of the bit counts stored by BN_set_params().
+ * which: 0 = mult, 1 = high, 2 = low, 3 = mont; any other value yields 0.
+ */
+int BN_get_params(int which)
+{
+    switch (which) {
+    case 0:
+        return bn_limit_bits;
+    case 1:
+        return bn_limit_bits_high;
+    case 2:
+        return bn_limit_bits_low;
+    case 3:
+        return bn_limit_bits_mont;
+    default:
+        return 0;
+    }
+}
+#endif
+
+/*
+ * Return a pointer to a shared, read-only BIGNUM holding the value 1.
+ * The object and its single data word live in static const storage and are
+ * tagged BN_FLG_STATIC_DATA.
+ */
+const BIGNUM *BN_value_one(void)
+{
+    static const BN_ULONG one_word = 1L;
+    static const BIGNUM one = {
+        (BN_ULONG *)&one_word, 1, 1, 0, BN_FLG_STATIC_DATA
+    };
+
+    return &one;
+}
+
+/*
+ * Return the number of significant bits in the word l (0 for l == 0).
+ *
+ * The word is narrowed by successive mask tests to the highest non-zero
+ * byte; that byte is looked up in bits[] and the byte's bit offset is added.
+ * Which mask tests are compiled in depends on the configured word size
+ * (SIXTY_FOUR_BIT_LONG, SIXTY_FOUR_BIT, THIRTY_TWO_BIT).
+ */
+int BN_num_bits_word(BN_ULONG l)
+{
+    /* bits[x] is the number of significant bits in the byte value x */
+    static const unsigned char bits[256] = {
+        0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    };
+
+    /*
+     * 64-bit words: first decide whether any of the upper 32 bits are set.
+     * The two variants below differ only in the constant suffix (L vs LL).
+     * Each ends in a dangling "else" that binds to the brace block after
+     * the #endif lines.
+     */
+#if defined(SIXTY_FOUR_BIT_LONG)
+    if (l & 0xffffffff00000000L) {
+        if (l & 0xffff000000000000L) {
+            if (l & 0xff00000000000000L) {
+                return (bits[(int)(l >> 56)] + 56);
+            } else
+                return (bits[(int)(l >> 48)] + 48);
+        } else {
+            if (l & 0x0000ff0000000000L) {
+                return (bits[(int)(l >> 40)] + 40);
+            } else
+                return (bits[(int)(l >> 32)] + 32);
+        }
+    } else
+#else
+# ifdef SIXTY_FOUR_BIT
+    if (l & 0xffffffff00000000LL) {
+        if (l & 0xffff000000000000LL) {
+            if (l & 0xff00000000000000LL) {
+                return (bits[(int)(l >> 56)] + 56);
+            } else
+                return (bits[(int)(l >> 48)] + 48);
+        } else {
+            if (l & 0x0000ff0000000000LL) {
+                return (bits[(int)(l >> 40)] + 40);
+            } else
+                return (bits[(int)(l >> 32)] + 32);
+        }
+    } else
+# endif
+#endif
+    {
+        /* low 32 bits: upper 16 first ... */
+#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
+        if (l & 0xffff0000L) {
+            if (l & 0xff000000L)
+                return (bits[(int)(l >> 24L)] + 24);
+            else
+                return (bits[(int)(l >> 16L)] + 16);
+        } else
+#endif
+        {
+            /* ... then the second-lowest byte, and finally the lowest byte */
+#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
+            if (l & 0xff00L)
+                return (bits[(int)(l >> 8)] + 8);
+            else
+#endif
+                return (bits[(int)(l)]);
+        }
+    }
+}
+
+/*
+ * Return the number of significant bits in a: 0 for a zero value,
+ * otherwise BN_BITS2 for every word below the top one plus the bit length
+ * of the top word.
+ */
+int BN_num_bits(const BIGNUM *a)
+{
+    int top_idx;
+
+    bn_check_top(a);
+
+    if (BN_is_zero(a))
+        return 0;
+
+    top_idx = a->top - 1;
+    return top_idx * BN_BITS2 + BN_num_bits_word(a->d[top_idx]);
+}
+
+/*
+ * Wipe and release a BIGNUM.  A NULL argument is a no-op.
+ *
+ * The word array is cleansed and freed only when this BIGNUM owns it.  Data
+ * tagged BN_FLG_STATIC_DATA is left untouched: it is not ours to modify and
+ * may live in read-only storage (BN_value_one() in this file points at a
+ * 'static const' word), so writing to it could fault or corrupt a value
+ * that is still in use elsewhere.
+ *
+ * The BIGNUM structure itself is always cleansed, and is freed when it was
+ * allocated by BN_new() (BN_FLG_MALLOCED).
+ */
+void BN_clear_free(BIGNUM *a)
+{
+    int malloced;
+
+    if (a == NULL)
+        return;
+    bn_check_top(a);
+    if (a->d != NULL && !BN_get_flags(a, BN_FLG_STATIC_DATA)) {
+        OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
+        OPENSSL_free(a->d);
+    }
+    /* read the flag before the structure is wiped */
+    malloced = BN_get_flags(a, BN_FLG_MALLOCED);
+    OPENSSL_cleanse(a, sizeof(BIGNUM));
+    if (malloced)
+        OPENSSL_free(a);
+}
+
+/*
+ * Release a BIGNUM without wiping it.  A NULL argument is a no-op.
+ * The word array is freed unless it is static data.  The structure is freed
+ * when it carries BN_FLG_MALLOCED; otherwise its data pointer is reset to
+ * NULL (and, unless OPENSSL_NO_DEPRECATED, BN_FLG_FREE is set).
+ */
+void BN_free(BIGNUM *a)
+{
+    if (a == NULL)
+        return;
+    bn_check_top(a);
+
+    if (a->d != NULL && !BN_get_flags(a, BN_FLG_STATIC_DATA))
+        OPENSSL_free(a->d);
+
+    if ((a->flags & BN_FLG_MALLOCED) != 0) {
+        OPENSSL_free(a);
+        return;
+    }
+
+    /* caller-owned structure: leave it in a recognisably released state */
+#ifndef OPENSSL_NO_DEPRECATED
+    a->flags |= BN_FLG_FREE;
+#endif
+    a->d = NULL;
+}
+
+/* Initialise a caller-provided BIGNUM by zero-filling the whole structure. */
+void BN_init(BIGNUM *a)
+{
+    memset(a, 0, sizeof(*a));
+    bn_check_top(a);
+}
+
+/*
+ * Allocate an empty BIGNUM (value 0, no word array) flagged
+ * BN_FLG_MALLOCED.  Returns NULL and raises ERR_R_MALLOC_FAILURE if the
+ * allocation fails.
+ */
+BIGNUM *BN_new(void)
+{
+    BIGNUM *bn = (BIGNUM *)OPENSSL_malloc(sizeof(BIGNUM));
+
+    if (bn == NULL) {
+        BNerr(BN_F_BN_NEW, ERR_R_MALLOC_FAILURE);
+        return NULL;
+    }
+
+    bn->d = NULL;
+    bn->dmax = 0;
+    bn->top = 0;
+    bn->neg = 0;
+    bn->flags = BN_FLG_MALLOCED;
+    bn_check_top(bn);
+    return bn;
+}
+
+/* This is used both by bn_expand2() and bn_dup_expand() */
+/* The caller MUST check that words > b->dmax before calling this */
+/*
+ * Allocate a fresh array of 'words' BN_ULONGs and copy the b->top words
+ * currently in use from b->d into it.  b itself is not modified; the caller
+ * is responsible for installing (or freeing) the returned array.
+ *
+ * Returns the new array, or NULL with an error raised when
+ *   - words exceeds INT_MAX / (4 * BN_BITS2)   (BN_R_BIGNUM_TOO_LONG),
+ *   - b carries BN_FLG_STATIC_DATA             (BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),
+ *   - the allocation fails                     (ERR_R_MALLOC_FAILURE).
+ *
+ * Words above b->top in the new array are left uninitialised unless PURIFY
+ * is defined.
+ */
+static BN_ULONG *bn_expand_internal(const BIGNUM *b, int words)
+{
+    /* 'a' remembers the start of the allocation; 'A' advances during the copy */
+    BN_ULONG *A, *a = NULL;
+    const BN_ULONG *B;
+    int i;
+
+    bn_check_top(b);
+
+    if (words > (INT_MAX / (4 * BN_BITS2))) {
+        BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_BIGNUM_TOO_LONG);
+        return NULL;
+    }
+    if (BN_get_flags(b, BN_FLG_STATIC_DATA)) {
+        BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
+        return (NULL);
+    }
+    a = A = (BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG) * words);
+    if (A == NULL) {
+        BNerr(BN_F_BN_EXPAND_INTERNAL, ERR_R_MALLOC_FAILURE);
+        return (NULL);
+    }
+#ifdef PURIFY
+    /*
+     * Valgrind complains in BN_consttime_swap because we process the whole
+     * array even if it's not initialised yet. This doesn't matter in that
+     * function - what's important is constant time operation (we're not
+     * actually going to use the data)
+     */
+    memset(a, 0, sizeof(BN_ULONG) * words);
+#endif
+
+#if 1
+    B = b->d;
+    /* Check if the previous number needs to be copied */
+    if (B != NULL) {
+        /* copy four words per iteration ... */
+        for (i = b->top >> 2; i > 0; i--, A += 4, B += 4) {
+            /*
+             * The fact that the loop is unrolled
+             * 4-wise is a tribute to Intel. It's
+             * the one that doesn't have enough
+             * registers to accommodate more data.
+             * I'd unroll it 8-wise otherwise:-)
+             *
+             *              <appro@fy.chalmers.se>
+             */
+            BN_ULONG a0, a1, a2, a3;
+            a0 = B[0];
+            a1 = B[1];
+            a2 = B[2];
+            a3 = B[3];
+            A[0] = a0;
+            A[1] = a1;
+            A[2] = a2;
+            A[3] = a3;
+        }
+        /*
+         * workaround for ultrix cc: without 'case 0', the optimizer does
+         * the switch table by doing a=top&3; a--; goto jump_table[a];
+         * which fails for top== 0
+         */
+        /* ... then the remaining 0-3 words; the cases fall through on purpose */
+        switch (b->top & 3) {
+        case 3:
+            A[2] = B[2];
+        case 2:
+            A[1] = B[1];
+        case 1:
+            A[0] = B[0];
+        case 0:
+            ;
+        }
+    }
+#else
+    memset(A, 0, sizeof(BN_ULONG) * words);
+    memcpy(A, b->d, sizeof(b->d[0]) * b->top);
+#endif
+
+    return (a);
+}
+
+/*
+ * This is an internal function that can be used instead of bn_expand2() when
+ * there is a need to copy BIGNUMs instead of only expanding the data part,
+ * while still expanding them. Especially useful when needing to expand
+ * BIGNUMs that are declared 'const' and should therefore not be changed. The
+ * reason to use this instead of a BN_dup() followed by a bn_expand2() is
+ * memory allocation overhead.  A BN_dup() followed by a bn_expand2() will
+ * allocate new memory for the BIGNUM data twice, and free it once, while
+ * bn_dup_expand() makes sure allocation is made only once.
+ */
+
+#ifndef OPENSSL_NO_DEPRECATED
+/*
+ * Duplicate b into a newly allocated BIGNUM whose word array has room for
+ * at least 'words' words.  Returns the new BIGNUM, or NULL on failure.
+ */
+BIGNUM *bn_dup_expand(const BIGNUM *b, int words)
+{
+    BIGNUM *dup = NULL;
+    BN_ULONG *data;
+
+    bn_check_top(b);
+
+    /*
+     * Caveat: when words <= b->dmax the work is delegated to BN_dup(), which
+     * does not preserve 'dmax', so the result may be smaller than 'words'
+     * if b->top < words.  (bn_dup_expand() is not used anywhere yet.)
+     */
+    if (words <= b->dmax) {
+        dup = BN_dup(b);
+    } else if ((data = bn_expand_internal(b, words)) != NULL) {
+        dup = BN_new();
+        if (dup == NULL) {
+            /* BN_new() failed: do not leak the expanded copy */
+            OPENSSL_free(data);
+        } else {
+            dup->d = data;
+            dup->dmax = words;
+            dup->top = b->top;
+            dup->neg = b->neg;
+        }
+    }
+    /* a NULL 'data' means bn_expand_internal() failed; dup stays NULL */
+
+    bn_check_top(dup);
+    return dup;
+}
+#endif
+
+/*
+ * This is an internal function that should not be used in applications. It
+ * ensures that 'b' has enough room for a 'words' word number and initialises
+ * any unused part of b->d with leading zeros. It is mostly used by the
+ * various BIGNUM routines. If there is an error, NULL is returned. If not,
+ * 'b' is returned.
+ */
+
+/*
+ * Ensure b can hold at least 'words' words.
+ *
+ * When b->dmax is already large enough nothing changes.  Otherwise a larger
+ * array is obtained from bn_expand_internal() (which copies the words in
+ * use), the old array is wiped and released, and b->d / b->dmax are updated.
+ * The old array is cleansed before it is freed because it may hold key
+ * material that would otherwise linger on the heap.
+ *
+ * Words above b->top are not zero-filled here; given what b->top means,
+ * nothing may rely on their contents.
+ *
+ * Returns b on success, NULL if the expansion failed (b is left unchanged).
+ */
+BIGNUM *bn_expand2(BIGNUM *b, int words)
+{
+    bn_check_top(b);
+
+    if (words > b->dmax) {
+        BN_ULONG *a = bn_expand_internal(b, words);
+        if (!a)
+            return NULL;
+        if (b->d) {
+            /*
+             * bn_expand_internal() refuses static data, so this array is
+             * owned by b and safe to overwrite.
+             */
+            OPENSSL_cleanse(b->d, b->dmax * sizeof(b->d[0]));
+            OPENSSL_free(b->d);
+        }
+        b->d = a;
+        b->dmax = words;
+    }
+
+    bn_check_top(b);
+    return b;
+}
+
+/*
+ * Return a newly allocated copy of a, or NULL when a is NULL or when
+ * allocation or copying fails.
+ */
+BIGNUM *BN_dup(const BIGNUM *a)
+{
+    BIGNUM *copy;
+
+    if (a == NULL)
+        return NULL;
+    bn_check_top(a);
+
+    copy = BN_new();
+    if (copy == NULL)
+        return NULL;
+
+    if (BN_copy(copy, a) != NULL) {
+        bn_check_top(copy);
+        return copy;
+    }
+
+    /* copy failed: release the half-built duplicate */
+    BN_free(copy);
+    return NULL;
+}
+
+/*
+ * Copy the value of b (words in use, top and sign) into a.
+ * Returns a, or NULL if a could not be grown to b->top words.
+ * Copying a BIGNUM onto itself is a no-op.
+ */
+BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
+{
+    int idx;
+
+    bn_check_top(b);
+
+    if (a == b)
+        return a;
+    if (bn_wexpand(a, b->top) == NULL)
+        return NULL;
+
+    /* word-by-word copy of the b->top words in use */
+    for (idx = 0; idx < b->top; idx++)
+        a->d[idx] = b->d[idx];
+
+    a->top = b->top;
+    a->neg = b->neg;
+    bn_check_top(a);
+    return a;
+}
+
+/*
+ * Exchange the values of a and b by swapping d, top, dmax and neg.
+ * Flags are recombined rather than swapped: BN_FLG_MALLOCED stays with the
+ * structure it described, BN_FLG_STATIC_DATA follows the word array, and
+ * all other flag bits are cleared.
+ */
+void BN_swap(BIGNUM *a, BIGNUM *b)
+{
+    int a_flags, b_flags;
+    BN_ULONG *d_hold;
+    int int_hold;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    a_flags = a->flags;
+    b_flags = b->flags;
+
+    d_hold = a->d;
+    a->d = b->d;
+    b->d = d_hold;
+
+    int_hold = a->top;
+    a->top = b->top;
+    b->top = int_hold;
+
+    int_hold = a->dmax;
+    a->dmax = b->dmax;
+    b->dmax = int_hold;
+
+    int_hold = a->neg;
+    a->neg = b->neg;
+    b->neg = int_hold;
+
+    a->flags = (a_flags & BN_FLG_MALLOCED) | (b_flags & BN_FLG_STATIC_DATA);
+    b->flags = (b_flags & BN_FLG_MALLOCED) | (a_flags & BN_FLG_STATIC_DATA);
+
+    bn_check_top(a);
+    bn_check_top(b);
+}
+
+/*
+ * Erase the value held in a and set it to zero (top = 0, neg = 0).
+ * All a->dmax allocated words are wiped, not just the a->top words in use.
+ * OPENSSL_cleanse() is used instead of memset() so the wipe matches the one
+ * performed by BN_clear_free().
+ */
+void BN_clear(BIGNUM *a)
+{
+    bn_check_top(a);
+    if (a->d != NULL)
+        OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
+    a->top = 0;
+    a->neg = 0;
+}
+
+/*
+ * Return the value of a as a single word: a->d[0] when exactly one word is
+ * in use, 0 when none is, and BN_MASK2 when the value needs more than one
+ * word.  The sign is ignored.
+ */
+BN_ULONG BN_get_word(const BIGNUM *a)
+{
+    if (a->top == 1)
+        return a->d[0];
+    if (a->top > 1)
+        return BN_MASK2;        /* does not fit in one word */
+    return 0;
+}
+
+/*
+ * Set a to the non-negative single-word value w.
+ * Returns 1 on success, 0 if room for one word could not be obtained.
+ */
+int BN_set_word(BIGNUM *a, BN_ULONG w)
+{
+    bn_check_top(a);
+
+    if (bn_expand(a, (int)sizeof(BN_ULONG) * 8) == NULL)
+        return 0;
+
+    a->d[0] = w;
+    a->top = (w != 0) ? 1 : 0;  /* zero is represented with no words in use */
+    a->neg = 0;
+    bn_check_top(a);
+    return 1;
+}
+
+/*
+ * Convert the big-endian, unsigned byte string s[0..len-1] into a BIGNUM.
+ *
+ * ret may be NULL, in which case a new BIGNUM is allocated; otherwise the
+ * result is stored in ret.  The result is always non-negative, including
+ * for empty input.
+ *
+ * Returns the BIGNUM holding the result, or NULL when len is negative or an
+ * allocation fails.  A BIGNUM allocated here is released again on failure;
+ * a caller-supplied ret is never freed.
+ */
+BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
+{
+    unsigned int i, m;
+    unsigned int n;
+    BN_ULONG l;
+    BIGNUM *bn = NULL;
+
+    /*
+     * A negative length would wrap to a huge unsigned byte count below and
+     * run far past the end of s; reject it before anything is allocated.
+     */
+    if (len < 0)
+        return (NULL);
+    if (ret == NULL)
+        ret = bn = BN_new();
+    if (ret == NULL)
+        return (NULL);
+    bn_check_top(ret);
+    l = 0;
+    n = len;
+    if (n == 0) {
+        ret->top = 0;
+        /* do not leave a stale sign on a reused BIGNUM */
+        ret->neg = 0;
+        return (ret);
+    }
+    /* i: number of words needed; m: bytes (minus one) in the top word */
+    i = ((n - 1) / BN_BYTES) + 1;
+    m = ((n - 1) % (BN_BYTES));
+    if (bn_wexpand(ret, (int)i) == NULL) {
+        if (bn)
+            BN_free(bn);
+        return NULL;
+    }
+    ret->top = i;
+    ret->neg = 0;
+    /* accumulate bytes into l and store a word each time it fills up */
+    while (n--) {
+        l = (l << 8L) | *(s++);
+        if (m-- == 0) {
+            ret->d[--i] = l;
+            l = 0;
+            m = BN_BYTES - 1;
+        }
+    }
+    /*
+     * need to call this due to clear byte at top if avoiding having the top
+     * bit set (-ve number)
+     */
+    bn_correct_top(ret);
+    return (ret);
+}
+
+/* ignore negative */
+/*
+ * Write the magnitude of a to 'to' as BN_num_bytes(a) big-endian bytes
+ * (the sign is ignored) and return the number of bytes written.
+ */
+int BN_bn2bin(const BIGNUM *a, unsigned char *to)
+{
+    int total, pos;
+    BN_ULONG word;
+
+    bn_check_top(a);
+    total = BN_num_bytes(a);
+    /* pos is the byte's significance: most significant byte goes out first */
+    for (pos = total - 1; pos >= 0; pos--) {
+        word = a->d[pos / BN_BYTES];
+        *(to++) = (unsigned char)(word >> (8 * (pos % BN_BYTES))) & 0xff;
+    }
+    return total;
+}
+
+/*
+ * Compare the magnitudes of a and b, ignoring sign.
+ * When the word counts differ, their difference (a->top - b->top) is
+ * returned; otherwise words are compared from the most significant down and
+ * the result is 1, -1 or 0.
+ */
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b)
+{
+    int idx, top_diff;
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    top_diff = a->top - b->top;
+    if (top_diff != 0)
+        return top_diff;
+
+    for (idx = a->top - 1; idx >= 0; idx--) {
+        const BN_ULONG wa = a->d[idx];
+        const BN_ULONG wb = b->d[idx];
+
+        if (wa != wb)
+            return (wa > wb) ? 1 : -1;
+    }
+    return 0;
+}
+
+/*
+ * Signed comparison of a and b: returns -1, 0 or 1.
+ * NULL handling: a non-NULL a against a NULL b gives -1, a NULL a against a
+ * non-NULL b gives 1, and two NULLs give 0.
+ */
+int BN_cmp(const BIGNUM *a, const BIGNUM *b)
+{
+    int idx;
+    int sign;                   /* result when |a| > |b|; negated when |a| < |b| */
+
+    if (a == NULL || b == NULL) {
+        if (a != NULL)
+            return -1;
+        return (b != NULL) ? 1 : 0;
+    }
+
+    bn_check_top(a);
+    bn_check_top(b);
+
+    if (a->neg != b->neg)
+        return a->neg ? -1 : 1;
+
+    /* same sign: for negatives the larger magnitude is the smaller value */
+    sign = (a->neg == 0) ? 1 : -1;
+
+    if (a->top > b->top)
+        return sign;
+    if (a->top < b->top)
+        return -sign;
+
+    for (idx = a->top - 1; idx >= 0; idx--) {
+        const BN_ULONG wa = a->d[idx];
+        const BN_ULONG wb = b->d[idx];
+
+        if (wa > wb)
+            return sign;
+        if (wa < wb)
+            return -sign;
+    }
+    return 0;
+}
+
+/*
+ * Set bit n of a, growing a (and zero-filling the new words) if the bit
+ * lies beyond the words currently in use.
+ * Returns 1 on success, 0 if n is negative or the expansion fails.
+ */
+int BN_set_bit(BIGNUM *a, int n)
+{
+    int word, bit, fill;
+
+    if (n < 0)
+        return 0;
+
+    word = n / BN_BITS2;
+    bit = n % BN_BITS2;
+
+    if (word >= a->top) {
+        if (bn_wexpand(a, word + 1) == NULL)
+            return 0;
+        for (fill = a->top; fill <= word; fill++)
+            a->d[fill] = 0;
+        a->top = word + 1;
+    }
+
+    a->d[word] |= ((BN_ULONG)1) << bit;
+    bn_check_top(a);
+    return 1;
+}
+
+/*
+ * Clear bit n of a and re-normalise the word count.
+ * Returns 1 on success, 0 if n is negative or lies beyond the words in use.
+ */
+int BN_clear_bit(BIGNUM *a, int n)
+{
+    int word, bit;
+
+    bn_check_top(a);
+    if (n < 0)
+        return 0;
+
+    word = n / BN_BITS2;
+    bit = n % BN_BITS2;
+    if (word >= a->top)
+        return 0;
+
+    a->d[word] &= ~(((BN_ULONG)1) << bit);
+    bn_correct_top(a);
+    return 1;
+}
+
+/*
+ * Return 1 if bit n of a is set, 0 otherwise (including when n is negative
+ * or lies beyond the words in use).
+ */
+int BN_is_bit_set(const BIGNUM *a, int n)
+{
+    int word, bit;
+
+    bn_check_top(a);
+    if (n < 0)
+        return 0;
+
+    word = n / BN_BITS2;
+    bit = n % BN_BITS2;
+    if (word >= a->top)
+        return 0;
+
+    return (int)((a->d[word] >> bit) & ((BN_ULONG)1));
+}
+
+/*
+ * Truncate a to its low n bits.
+ * Returns 1 on success, 0 if n is negative or if bit n lies beyond the
+ * words in use (in which case a is left unchanged).
+ */
+int BN_mask_bits(BIGNUM *a, int n)
+{
+    int word, bit;
+
+    bn_check_top(a);
+    if (n < 0)
+        return 0;
+
+    word = n / BN_BITS2;
+    bit = n % BN_BITS2;
+    if (a->top <= word)
+        return 0;
+
+    if (bit != 0) {
+        /* keep a partial top word: drop everything from 'bit' upwards */
+        a->top = word + 1;
+        a->d[word] &= ~(BN_MASK2 << bit);
+    } else {
+        /* n falls on a word boundary: just drop the higher words */
+        a->top = word;
+    }
+    bn_correct_top(a);
+    return 1;
+}
+
+/*
+ * Set the sign of a: negative when b is non-zero and a is not zero,
+ * non-negative otherwise (zero is never marked negative).
+ */
+void BN_set_negative(BIGNUM *a, int b)
+{
+    a->neg = (b && !BN_is_zero(a)) ? 1 : 0;
+}
+
+/*
+ * Compare two word arrays of equal length n, most significant word (index
+ * n-1) first.  Returns 1 if a > b, -1 if a < b and 0 if they are equal.
+ *
+ * For n <= 0 there is nothing to compare and 0 is returned without touching
+ * either array; previously a[n-1] and b[n-1] were read unconditionally,
+ * which is an out-of-bounds access in that case.
+ */
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
+{
+    int i;
+    BN_ULONG aa, bb;
+
+    for (i = n - 1; i >= 0; i--) {
+        aa = a[i];
+        bb = b[i];
+        if (aa != bb)
+            return ((aa > bb) ? 1 : -1);
+    }
+    return (0);
+}
+
+/*
+ * Here follows a specialised variant of bn_cmp_words().  It has the
+ * property of performing the operation on arrays of different sizes. The
+ * sizes of those arrays are expressed through cl, which is the common length
+ * ( basically, min(len(a),len(b)) ), and dl, which is the delta between the
+ * two lengths, calculated as len(a)-len(b). All lengths are the number of
+ * BN_ULONGs...
+ */
+
+/*
+ * Compare word arrays a and b of possibly different lengths.
+ * cl is the number of words both arrays have; dl is len(a) - len(b).
+ * If the longer array has any non-zero word at index cl or above, it is the
+ * larger one; otherwise the result is bn_cmp_words() over the common cl
+ * words.  Returns 1, -1 or 0.
+ */
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl)
+{
+    int k;
+
+    if (dl < 0) {
+        /* b is longer: scan its extra words from the top down */
+        for (k = cl - dl - 1; k >= cl; k--) {
+            if (b[k] != 0)
+                return -1;      /* a < b */
+        }
+    } else if (dl > 0) {
+        /* a is longer: scan its extra words from the top down */
+        for (k = cl + dl - 1; k >= cl; k--) {
+            if (a[k] != 0)
+                return 1;       /* a > b */
+        }
+    }
+    return bn_cmp_words(a, b, cl);
+}
+
+/*
+ * Constant-time conditional swap of a and b.
+ * a and b are swapped if condition is not 0.  The code assumes that at most one bit of condition is set.
+ * nwords is the number of words to swap.  The code assumes that at least nwords are allocated in both a and b,
+ * and that no more than nwords are used by either a or b.
+ * a and b cannot be the same number
+ */
+/*
+ * Swap 'top' and the first nwords data words of a and b when condition is
+ * non-zero, using the same sequence of operations whether or not the swap
+ * takes effect (XOR with a mask that is all-ones or all-zeros).
+ *
+ * NOTE(review): only 'top' and the data words are exchanged here; 'neg',
+ * 'dmax' and 'flags' are not touched -- confirm callers do not need them.
+ * NOTE(review): for nwords <= 0 the switch takes the 'default' label, the
+ * loop body does not run, and control falls through to swap words 0..9
+ * regardless -- callers are expected to pass nwords >= 1.
+ */
+void BN_consttime_swap(BN_ULONG condition, BIGNUM *a, BIGNUM *b, int nwords)
+{
+    BN_ULONG t;
+    int i;
+
+    bn_wcheck_size(a, nwords);
+    bn_wcheck_size(b, nwords);
+
+    /* these assertions are compiled out unless BN_DEBUG leaves NDEBUG unset */
+    assert(a != b);
+    assert((condition & (condition - 1)) == 0);
+    assert(sizeof(BN_ULONG) >= sizeof(int));
+
+    /*
+     * Turn the at-most-one-bit condition into a full mask:
+     *   condition == 0 -> (all-ones >> (BN_BITS2-1)) - 1 == 1 - 1 == 0
+     *   condition != 0 -> top bit of (condition-1) is clear, so 0 - 1 ==
+     *                     all-ones
+     * (assumes BN_ULONG is exactly BN_BITS2 bits wide)
+     */
+    condition = ((condition - 1) >> (BN_BITS2 - 1)) - 1;
+
+    /* XOR-swap of 'top' under the mask; a no-op when the mask is zero */
+    t = (a->top ^ b->top) & condition;
+    a->top ^= t;
+    b->top ^= t;
+
+/* masked XOR-swap of one data word */
+#define BN_CONSTTIME_SWAP(ind) \
+        do { \
+                t = (a->d[ind] ^ b->d[ind]) & condition; \
+                a->d[ind] ^= t; \
+                b->d[ind] ^= t; \
+        } while (0)
+
+    /*
+     * Words 10 and above are handled by the loop under 'default'; the cases
+     * then fall through so that case k swaps words k-1 .. 0.
+     */
+    switch (nwords) {
+    default:
+        for (i = 10; i < nwords; i++)
+            BN_CONSTTIME_SWAP(i);
+        /* Fallthrough */
+    case 10:
+        BN_CONSTTIME_SWAP(9);   /* Fallthrough */
+    case 9:
+        BN_CONSTTIME_SWAP(8);   /* Fallthrough */
+    case 8:
+        BN_CONSTTIME_SWAP(7);   /* Fallthrough */
+    case 7:
+        BN_CONSTTIME_SWAP(6);   /* Fallthrough */
+    case 6:
+        BN_CONSTTIME_SWAP(5);   /* Fallthrough */
+    case 5:
+        BN_CONSTTIME_SWAP(4);   /* Fallthrough */
+    case 4:
+        BN_CONSTTIME_SWAP(3);   /* Fallthrough */
+    case 3:
+        BN_CONSTTIME_SWAP(2);   /* Fallthrough */
+    case 2:
+        BN_CONSTTIME_SWAP(1);   /* Fallthrough */
+    case 1:
+        BN_CONSTTIME_SWAP(0);
+    }
+#undef BN_CONSTTIME_SWAP
+}
--- a/openssl-1.0.2f/crypto/bn/bn_lib.o
+++ b/openssl-1.0.2f/crypto/bn/bn_lib.o
--- a/openssl-1.0.2f/crypto/bn/bn_mod.c
+++ b/openssl-1.0.2f/crypto/bn/bn_mod.c
@@ -0,0 +1,316 @@
+/* crypto/bn/bn_mod.c */
+/*
+ * Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Disabled reference implementation.  BN_mod is now provided as a #define,
+ * so the function below is never compiled; it shows that BN_mod is BN_div
+ * with the quotient discarded.
+ */
+#if 0                           /* now just a #define */
+int BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
+{
+    return (BN_div(NULL, rem, m, d, ctx));
+    /* note that  rem->neg == m->neg  (unless the remainder is zero) */
+}
+#endif
+
+/*
+ * Like BN_mod, but the remainder left in r is never negative, i.e.
+ * 0 <= r < |d| holds on success.
+ *
+ * Returns 1 on success and 0 if the underlying division or the final
+ * correction step fails.
+ */
+int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
+{
+    if (!BN_mod(r, m, d, ctx))
+        return 0;
+
+    if (r->neg) {
+        /* Here -|d| < r < 0, so move r up by |d|. */
+        if (d->neg)
+            return BN_sub(r, r, d);
+        return BN_add(r, r, d);
+    }
+
+    return 1;
+}
+
+/*
+ * r = (a + b) mod m, with a non-negative result (reduced via BN_nnmod).
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx)
+{
+    return BN_add(r, a, b) ? BN_nnmod(r, r, m, ctx) : 0;
+}
+
+/*
+ * Variant of BN_mod_add for callers that guarantee 0 <= a < m and
+ * 0 <= b < m.  Under that precondition a single conditional subtraction of
+ * m is enough, so no BN_CTX is needed.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m)
+{
+    if (BN_uadd(r, a, b)) {
+        /* Sum already below m: nothing left to do. */
+        if (BN_ucmp(r, m) < 0)
+            return 1;
+        return BN_usub(r, r, m);
+    }
+    return 0;
+}
+
+/*
+ * r = (a - b) mod m, with a non-negative result (reduced via BN_nnmod).
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx)
+{
+    if (BN_sub(r, a, b))
+        return BN_nnmod(r, r, m, ctx);
+    return 0;
+}
+
+/*
+ * Variant of BN_mod_sub for callers that guarantee 0 <= a < m and
+ * 0 <= b < m.  A negative difference is corrected by adding m once.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m)
+{
+    if (!BN_sub(r, a, b))
+        return 0;
+
+    return r->neg ? BN_add(r, r, m) : 1;
+}
+
+/*
+ * r = (a * b) mod m, non-negative.  Straightforward (slow but working)
+ * implementation: form the full product in a temporary taken from ctx,
+ * then reduce it with BN_nnmod.  When a and b are the same object the
+ * squaring routine is used instead of the general multiply.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx)
+{
+    BIGNUM *prod;
+    int ok = 0;
+
+    bn_check_top(a);
+    bn_check_top(b);
+    bn_check_top(m);
+
+    BN_CTX_start(ctx);
+    prod = BN_CTX_get(ctx);
+    if (prod != NULL) {
+        int mul_ok = (a == b) ? BN_sqr(prod, a, ctx)
+                              : BN_mul(prod, a, b, ctx);
+
+        if (mul_ok && BN_nnmod(r, prod, m, ctx)) {
+            bn_check_top(r);
+            ok = 1;
+        }
+    }
+    /* Always balance BN_CTX_start, on success and failure alike. */
+    BN_CTX_end(ctx);
+    return ok;
+}
+
+/*
+ * r = (a * a) mod m.  The square is never negative, so plain BN_mod is
+ * sufficient and BN_nnmod is not needed.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
+{
+    return BN_sqr(r, a, ctx) ? BN_mod(r, r, m, ctx) : 0;
+}
+
+/*
+ * r = (a << 1) mod m, with a non-negative result (reduced via BN_nnmod).
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
+{
+    if (BN_lshift1(r, a)) {
+        bn_check_top(r);
+        return BN_nnmod(r, r, m, ctx);
+    }
+    return 0;
+}
+
+/*
+ * Variant of BN_mod_lshift1 for callers that guarantee 0 <= a < m.  After
+ * doubling, at most one subtraction of m is performed.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
+{
+    if (!BN_lshift1(r, a))
+        return 0;
+    bn_check_top(r);
+
+    return (BN_cmp(r, m) < 0) ? 1 : BN_sub(r, r, m);
+}
+
+/*
+ * r = (a << n) mod m, non-negative.
+ *
+ * a is first reduced into [0, |m|) with BN_nnmod, then the shift is done by
+ * BN_mod_lshift_quick against a non-negative modulus.  If m is negative a
+ * temporary duplicate with the sign cleared is used and released before
+ * returning.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                  BN_CTX *ctx)
+{
+    const BIGNUM *modulus = m;
+    BIGNUM *m_copy = NULL;
+    int ok;
+
+    if (!BN_nnmod(r, a, m, ctx))
+        return 0;
+
+    if (m->neg) {
+        if ((m_copy = BN_dup(m)) == NULL)
+            return 0;
+        m_copy->neg = 0;
+        modulus = m_copy;
+    }
+
+    ok = BN_mod_lshift_quick(r, r, n, modulus);
+    bn_check_top(r);
+
+    if (m_copy != NULL)
+        BN_free(m_copy);
+    return ok;
+}
+
+/*
+ * Variant of BN_mod_lshift for callers that guarantee 0 <= a < m.
+ *
+ * The shift is carried out in steps.  Each step shifts by as many bits as
+ * the bit-length gap between m and r allows (at least one bit, at most the
+ * remaining count) and then subtracts m once if r has reached or passed it.
+ * If r is found to have more bits than m, the input was not reduced and
+ * the call fails with BN_R_INPUT_NOT_REDUCED.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
+{
+    if (r != a && BN_copy(r, a) == NULL)
+        return 0;
+
+    while (n > 0) {
+        /* Number of bits r can grow by without exceeding m's bit length. */
+        int step = BN_num_bits(m) - BN_num_bits(r);
+
+        if (step < 0) {
+            BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
+            return 0;
+        }
+
+        if (step > n)
+            step = n;
+
+        if (step == 0) {
+            /* Same bit length as m: advance by a single bit. */
+            if (!BN_lshift1(r, r))
+                return 0;
+            n--;
+        } else {
+            if (!BN_lshift(r, r, step))
+                return 0;
+            n -= step;
+        }
+
+        /* r now has no more bits than m plus one; bring it back below m. */
+        if (BN_cmp(r, m) >= 0 && !BN_sub(r, r, m))
+            return 0;
+    }
+    bn_check_top(r);
+
+    return 1;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_mod.o
+++ b/openssl-1.0.2f/crypto/bn/bn_mod.o
--- a/openssl-1.0.2f/crypto/bn/bn_mont.c
+++ b/openssl-1.0.2f/crypto/bn/bn_mont.c
@@ -0,0 +1,558 @@
+/* crypto/bn/bn_mont.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/*
+ * Details about Montgomery multiplication algorithms can be found at
+ * http://security.ece.orst.edu/publications.html, e.g.
+ * http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
+ * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+#define MONT_WORD               /* use the faster word-based algorithm */
+
+#ifdef MONT_WORD
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
+/*
+ * Montgomery multiplication: r = a * b reduced through mont.
+ *
+ * When the assembler routine is compiled in (OPENSSL_BN_ASM_MONT together
+ * with MONT_WORD) and a, b and the modulus all have the same word count
+ * greater than one, bn_mul_mont is tried first.  If that path is not taken,
+ * or bn_mul_mont returns 0, the generic path multiplies (or squares, when a
+ * and b are the same object) into a ctx temporary and reduces it.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          BN_MONT_CTX *mont, BN_CTX *ctx)
+{
+    BIGNUM *prod;
+    int ok = 0;
+#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
+    int words = mont->N.top;
+
+    if (words > 1 && a->top == words && b->top == words) {
+        if (bn_wexpand(r, words) == NULL)
+            return 0;
+        if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, words)) {
+            r->neg = a->neg ^ b->neg;
+            r->top = words;
+            bn_correct_top(r);
+            return 1;
+        }
+    }
+#endif
+
+    BN_CTX_start(ctx);
+    prod = BN_CTX_get(ctx);
+    if (prod == NULL)
+        goto done;
+
+    bn_check_top(prod);
+    if (!((a == b) ? BN_sqr(prod, a, ctx) : BN_mul(prod, a, b, ctx)))
+        goto done;
+
+    /* reduce from aRR to aR */
+#ifdef MONT_WORD
+    if (!BN_from_montgomery_word(r, prod, mont))
+        goto done;
+#else
+    if (!BN_from_montgomery(r, prod, mont, ctx))
+        goto done;
+#endif
+    bn_check_top(r);
+    ok = 1;
+ done:
+    BN_CTX_end(ctx);
+    return ok;
+}
+
+#ifdef MONT_WORD
+/*
+ * Word-based Montgomery reduction of r with respect to mont->N; the result
+ * is written to ret.
+ *
+ * r is used as scratch space: it is expanded to 2 * N.top words, modified
+ * in place, and its sign is XORed with the sign of N.  Returns 1 on success
+ * and 0 if either bn_wexpand() call fails.
+ */
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+{
+    BIGNUM *n;
+    BN_ULONG *ap, *np, *rp, n0, v, carry;
+    int nl, max, i;
+
+    n = &(mont->N);
+    nl = n->top;
+    /* Modulus has no words: report success with ret holding zero words. */
+    if (nl == 0) {
+        ret->top = 0;
+        return (1);
+    }
+
+    max = (2 * nl);             /* carry is stored separately */
+    if (bn_wexpand(r, max) == NULL)
+        return (0);
+
+    r->neg ^= n->neg;
+    np = n->d;
+    rp = r->d;
+
+    /* clear the top words of T */
+# if 1
+    for (i = r->top; i < max; i++) /* memset? XXX */
+        rp[i] = 0;
+# else
+    memset(&(rp[r->top]), 0, (max - r->top) * sizeof(BN_ULONG));
+# endif
+
+    r->top = max;
+    n0 = mont->n0[0];
+
+# ifdef BN_COUNT
+    fprintf(stderr, "word BN_from_montgomery_word %d * %d\n", nl, nl);
+# endif
+    /*
+     * One pass per modulus word.  Each pass calls bn_mul_add_words on the
+     * nl-word window at rp with multiplier rp[0] * n0 (presumably adding
+     * that multiple of N into the window and returning the carry word -
+     * confirm against bn_mul_add_words), then slides the window up by one
+     * word via rp++.
+     */
+    for (carry = 0, i = 0; i < nl; i++, rp++) {
+# ifdef __TANDEM
+        {
+            long long t1;
+            long long t2;
+            long long t3;
+            t1 = rp[0] * (n0 & 0177777);
+            t2 = 037777600000l;
+            t2 = n0 & t2;
+            t3 = rp[0] & 0177777;
+            t2 = (t3 * t2) & BN_MASK2;
+            t1 = t1 + t2;
+            v = bn_mul_add_words(rp, np, nl, (BN_ULONG)t1);
+        }
+# else
+        v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
+# endif
+        /*
+         * Fold the returned word and the running carry into rp[nl].  The
+         * new carry is derived by comparing the sum with the old word,
+         * using only bitwise operations on comparison results (no branch).
+         */
+        v = (v + carry + rp[nl]) & BN_MASK2;
+        carry |= (v != rp[nl]);
+        carry &= (v <= rp[nl]);
+        rp[nl] = v;
+    }
+
+    if (bn_wexpand(ret, nl) == NULL)
+        return (0);
+    ret->top = nl;
+    ret->neg = r->neg;
+
+    /* rp: destination words in ret; ap: the upper nl words of r. */
+    rp = ret->d;
+    ap = &(r->d[nl]);
+
+# define BRANCH_FREE 1
+# if BRANCH_FREE
+    {
+        BN_ULONG *nrp;
+        size_t m;
+
+        v = bn_sub_words(rp, ap, np, nl) - carry;
+        /*
+         * if subtraction result is real, then trick unconditional memcpy
+         * below to perform in-place "refresh" instead of actual copy.
+         */
+        /* m is a pointer-select mask: nrp becomes either rp or ap. */
+        m = (0 - (size_t)v);
+        nrp =
+            (BN_ULONG *)(((PTR_SIZE_INT) rp & ~m) | ((PTR_SIZE_INT) ap & m));
+
+        /*
+         * Copy nl words from nrp to rp, four per iteration, while zeroing
+         * ap[].  Every nrp[k] is loaded before ap[k] is cleared because nrp
+         * may point at ap.
+         */
+        for (i = 0, nl -= 4; i < nl; i += 4) {
+            BN_ULONG t1, t2, t3, t4;
+
+            t1 = nrp[i + 0];
+            t2 = nrp[i + 1];
+            t3 = nrp[i + 2];
+            ap[i + 0] = 0;
+            t4 = nrp[i + 3];
+            ap[i + 1] = 0;
+            rp[i + 0] = t1;
+            ap[i + 2] = 0;
+            rp[i + 1] = t2;
+            ap[i + 3] = 0;
+            rp[i + 2] = t3;
+            rp[i + 3] = t4;
+        }
+        /* Restore nl and handle the remaining (fewer than five) words. */
+        for (nl += 4; i < nl; i++)
+            rp[i] = nrp[i], ap[i] = 0;
+    }
+# else
+    if (bn_sub_words(rp, ap, np, nl) - carry)
+        memcpy(rp, ap, nl * sizeof(BN_ULONG));
+# endif
+    bn_correct_top(r);
+    bn_correct_top(ret);
+    bn_check_top(ret);
+
+    return (1);
+}
+#endif                          /* MONT_WORD */
+
+/*
+ * Convert a out of Montgomery form using mont; the result goes to ret.
+ *
+ * In the MONT_WORD build a is copied into a ctx temporary (the word-based
+ * reduction modifies its input) and BN_from_montgomery_word does the work.
+ * Otherwise the reduction is carried out with full bignum operations using
+ * mont->Ni and mont->N, followed by one conditional subtraction of N.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+                       BN_CTX *ctx)
+{
+    int ok = 0;
+#ifdef MONT_WORD
+    BIGNUM *scratch;
+
+    BN_CTX_start(ctx);
+    scratch = BN_CTX_get(ctx);
+    if (scratch != NULL && BN_copy(scratch, a) != NULL)
+        ok = BN_from_montgomery_word(ret, scratch, mont);
+    BN_CTX_end(ctx);
+#else                           /* !MONT_WORD */
+    BIGNUM *low, *acc;
+
+    BN_CTX_start(ctx);
+    low = BN_CTX_get(ctx);
+    acc = BN_CTX_get(ctx);
+    if (low == NULL || acc == NULL)
+        goto done;
+
+    /* low = a mod 2^ri */
+    if (!BN_copy(low, a))
+        goto done;
+    BN_mask_bits(low, mont->ri);
+
+    /* acc = (low * Ni) mod 2^ri */
+    if (!BN_mul(acc, low, &mont->Ni, ctx))
+        goto done;
+    BN_mask_bits(acc, mont->ri);
+
+    /* ret = (a + acc * N) >> ri */
+    if (!BN_mul(low, acc, &mont->N, ctx))
+        goto done;
+    if (!BN_add(acc, a, low))
+        goto done;
+    if (!BN_rshift(ret, acc, mont->ri))
+        goto done;
+
+    if (BN_ucmp(ret, &(mont->N)) >= 0) {
+        if (!BN_usub(ret, ret, &(mont->N)))
+            goto done;
+    }
+    bn_check_top(ret);
+    ok = 1;
+ done:
+    BN_CTX_end(ctx);
+#endif                          /* MONT_WORD */
+    return ok;
+}
+
+/*
+ * Allocate and initialise a BN_MONT_CTX.  The BN_FLG_MALLOCED flag is set
+ * so that BN_MONT_CTX_free() releases the structure itself.
+ *
+ * Returns the new context, or NULL if allocation fails.
+ */
+BN_MONT_CTX *BN_MONT_CTX_new(void)
+{
+    BN_MONT_CTX *mont;
+
+    mont = (BN_MONT_CTX *)OPENSSL_malloc(sizeof(BN_MONT_CTX));
+    if (mont != NULL) {
+        BN_MONT_CTX_init(mont);
+        mont->flags = BN_FLG_MALLOCED;
+    }
+    return mont;
+}
+
+/*
+ * Put ctx into its empty state: ri, flags and both n0 words are zeroed and
+ * the three embedded BIGNUMs (RR, N, Ni) are initialised with BN_init.
+ */
+void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
+{
+    ctx->ri = 0;
+    ctx->flags = 0;
+    ctx->n0[1] = 0;
+    ctx->n0[0] = 0;
+
+    BN_init(&ctx->RR);
+    BN_init(&ctx->N);
+    BN_init(&ctx->Ni);
+}
+
+/*
+ * Release a Montgomery context.  A NULL argument is ignored.  The embedded
+ * BIGNUMs are released with BN_clear_free; the structure itself is freed
+ * only when it carries the BN_FLG_MALLOCED flag.
+ */
+void BN_MONT_CTX_free(BN_MONT_CTX *mont)
+{
+    if (mont != NULL) {
+        BN_clear_free(&mont->RR);
+        BN_clear_free(&mont->N);
+        BN_clear_free(&mont->Ni);
+
+        if ((mont->flags & BN_FLG_MALLOCED) != 0)
+            OPENSSL_free(mont);
+    }
+}
+
+/*
+ * Set up mont for Montgomery arithmetic modulo |mod|.
+ *
+ * A non-negative copy of mod is stored in mont->N.  The function then
+ * computes mont->ri and the reduction constant: mont->n0[] in the MONT_WORD
+ * build, mont->Ni otherwise.  Finally mont->RR is set to 2^(2*ri) mod N for
+ * use in conversions.  mont->RR doubles as a temporary (R) while the
+ * constants are being computed.
+ *
+ * Returns 1 on success, 0 if mod is zero or any bignum operation fails.
+ */
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
+{
+    int ret = 0;
+    BIGNUM *Ri, *R;
+
+    if (BN_is_zero(mod))
+        return 0;
+
+    BN_CTX_start(ctx);
+    if ((Ri = BN_CTX_get(ctx)) == NULL)
+        goto err;
+    R = &(mont->RR);            /* grab RR as a temp */
+    if (!BN_copy(&(mont->N), mod))
+        goto err;               /* Set N */
+    mont->N.neg = 0;
+
+#ifdef MONT_WORD
+    {
+        /* tmod is a stack BIGNUM over buf holding the low word(s) of mod. */
+        BIGNUM tmod;
+        BN_ULONG buf[2];
+
+        BN_init(&tmod);
+        tmod.d = buf;
+        tmod.dmax = 2;
+        tmod.neg = 0;
+
+        /* ri = bit length of mod rounded up to a whole number of words */
+        mont->ri = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
+
+# if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+        /*
+         * Only certain BN_BITS2<=32 platforms actually make use of n0[1],
+         * and we could use the #else case (with a shorter R value) for the
+         * others.  However, currently only the assembler files do know which
+         * is which.
+         */
+
+        BN_zero(R);
+        if (!(BN_set_bit(R, 2 * BN_BITS2)))
+            goto err;
+
+        tmod.top = 0;
+        if ((buf[0] = mod->d[0]))
+            tmod.top = 1;
+        if ((buf[1] = mod->top > 1 ? mod->d[1] : 0))
+            tmod.top = 2;
+
+        if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
+            goto err;
+        if (!BN_lshift(Ri, Ri, 2 * BN_BITS2))
+            goto err;           /* R*Ri */
+        if (!BN_is_zero(Ri)) {
+            if (!BN_sub_word(Ri, 1))
+                goto err;
+        } else {                /* if N mod word size == 1 */
+
+            /*
+             * Two words are written below.  bn_expand() takes a bit count,
+             * so passing sizeof(BN_ULONG) * 2 only guaranteed a single
+             * word; request the two words explicitly instead.
+             */
+            if (bn_wexpand(Ri, 2) == NULL)
+                goto err;
+            /* Ri-- (mod double word size) */
+            Ri->neg = 0;
+            Ri->d[0] = BN_MASK2;
+            Ri->d[1] = BN_MASK2;
+            Ri->top = 2;
+        }
+        if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
+            goto err;
+        /*
+         * Ni = (R*Ri-1)/N, keep only couple of least significant words:
+         */
+        mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+        mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+# else
+        BN_zero(R);
+        if (!(BN_set_bit(R, BN_BITS2)))
+            goto err;           /* R */
+
+        buf[0] = mod->d[0];     /* tmod = N mod word size */
+        buf[1] = 0;
+        tmod.top = buf[0] != 0 ? 1 : 0;
+        /* Ri = R^-1 mod N */
+        if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
+            goto err;
+        if (!BN_lshift(Ri, Ri, BN_BITS2))
+            goto err;           /* R*Ri */
+        if (!BN_is_zero(Ri)) {
+            if (!BN_sub_word(Ri, 1))
+                goto err;
+        } else {                /* if N mod word size == 1 */
+
+            if (!BN_set_word(Ri, BN_MASK2))
+                goto err;       /* Ri-- (mod word size) */
+        }
+        if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
+            goto err;
+        /*
+         * Ni = (R*Ri-1)/N, keep only least significant word:
+         */
+        mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+        mont->n0[1] = 0;
+# endif
+    }
+#else                           /* !MONT_WORD */
+    {                           /* bignum version */
+        mont->ri = BN_num_bits(&mont->N);
+        BN_zero(R);
+        if (!BN_set_bit(R, mont->ri))
+            goto err;           /* R = 2^ri */
+        /* Ri = R^-1 mod N */
+        if ((BN_mod_inverse(Ri, R, &mont->N, ctx)) == NULL)
+            goto err;
+        if (!BN_lshift(Ri, Ri, mont->ri))
+            goto err;           /* R*Ri */
+        if (!BN_sub_word(Ri, 1))
+            goto err;
+        /*
+         * Ni = (R*Ri-1) / N
+         */
+        if (!BN_div(&(mont->Ni), NULL, Ri, &mont->N, ctx))
+            goto err;
+    }
+#endif
+
+    /* setup RR for conversions */
+    BN_zero(&(mont->RR));
+    if (!BN_set_bit(&(mont->RR), mont->ri * 2))
+        goto err;
+    if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx))
+        goto err;
+
+    ret = 1;
+ err:
+    /* Release the ctx temporaries on every path after BN_CTX_start. */
+    BN_CTX_end(ctx);
+    return ret;
+}
+
+/*
+ * Copy the Montgomery parameters (RR, N, Ni, ri and both n0 words) from
+ * |from| into |to|.  The flags field of |to| is left as it is.  Copying a
+ * context onto itself is a no-op.
+ *
+ * Returns |to| on success, or NULL if one of the BIGNUM copies fails (in
+ * which case the earlier copies have already been made).
+ */
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
+{
+    if (to != from) {
+        if (BN_copy(&to->RR, &from->RR) == NULL
+            || BN_copy(&to->N, &from->N) == NULL
+            || BN_copy(&to->Ni, &from->Ni) == NULL)
+            return NULL;
+
+        to->ri = from->ri;
+        to->n0[0] = from->n0[0];
+        to->n0[1] = from->n0[1];
+    }
+    return to;
+}
+
+/*
+ * Return the Montgomery context stored at *pmont, creating it for |mod| on
+ * first use.  Access to *pmont is guarded by the lock identified by |lock|.
+ *
+ * Returns the context now stored in *pmont, or NULL if a new context could
+ * not be allocated or set up.
+ */
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+                                    const BIGNUM *mod, BN_CTX *ctx)
+{
+    BN_MONT_CTX *cached, *fresh;
+
+    /* Fast path: somebody already installed a context. */
+    CRYPTO_r_lock(lock);
+    cached = *pmont;
+    CRYPTO_r_unlock(lock);
+    if (cached != NULL)
+        return cached;
+
+    /*
+     * The set-up math is deliberately done outside the lock so unrelated
+     * threads are not serialised behind it.  The price is that several
+     * threads racing on the same 'pmont' each do the work; only the first
+     * to take the write lock installs its result and the others discard
+     * theirs.
+     */
+    fresh = BN_MONT_CTX_new();
+    if (fresh == NULL)
+        return NULL;
+    if (!BN_MONT_CTX_set(fresh, mod, ctx)) {
+        BN_MONT_CTX_free(fresh);
+        return NULL;
+    }
+
+    /* Install under the write lock unless another thread got there first. */
+    CRYPTO_w_lock(lock);
+    if (*pmont == NULL) {
+        *pmont = fresh;
+    } else {
+        BN_MONT_CTX_free(fresh);
+        fresh = *pmont;
+    }
+    CRYPTO_w_unlock(lock);
+    return fresh;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_mont.o
+++ b/openssl-1.0.2f/crypto/bn/bn_mont.o
--- a/openssl-1.0.2f/crypto/bn/bn_mpi.c
+++ b/openssl-1.0.2f/crypto/bn/bn_mpi.c
@@ -0,0 +1,128 @@
+/* crypto/bn/bn_mpi.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Serialise 'a' in MPI form: a 4-byte big-endian length followed by that
+ * many payload bytes.  The payload is the output of BN_bn2bin(), preceded by
+ * one zero byte when the bit length of 'a' is a non-zero multiple of 8, so
+ * that the top bit of the first payload byte is available for the sign.
+ * That bit is set when 'a' is negative.
+ *
+ * d   output buffer, or NULL to only query the required size
+ *
+ * Returns the total number of bytes (length prefix included).
+ */
+int BN_bn2mpi(const BIGNUM *a, unsigned char *d)
+{
+    int bits;
+    int num = 0;
+    int ext = 0;
+    long l;
+
+    bits = BN_num_bits(a);
+    num = (bits + 7) / 8;
+    if (bits > 0) {
+        ext = ((bits & 0x07) == 0);
+    }
+    if (d == NULL)
+        return (num + 4 + ext);
+
+    l = num + ext;
+    d[0] = (unsigned char)(l >> 24) & 0xff;
+    d[1] = (unsigned char)(l >> 16) & 0xff;
+    d[2] = (unsigned char)(l >> 8) & 0xff;
+    d[3] = (unsigned char)(l) & 0xff;
+    if (ext)
+        d[4] = 0;
+    num = BN_bn2bin(a, &(d[4 + ext]));
+    /*
+     * Only touch d[4] when there is a payload byte to carry the sign.  With
+     * an empty payload the reported size is 4, so d[4] would lie outside the
+     * buffer a caller sized from the NULL query; this matters if 'a' has no
+     * magnitude bits but still carries the neg flag.
+     */
+    if (a->neg && l > 0)
+        d[4] |= 0x80;
+    return (num + 4 + ext);
+}
+
+/*
+ * Parse the MPI encoding in d[0..n-1] (4-byte big-endian length followed by
+ * the payload) into a BIGNUM.
+ *
+ * d   encoded input
+ * n   number of bytes available at 'd'; must equal the encoded length + 4
+ * a   BIGNUM to fill in, or NULL to have a new one allocated
+ *
+ * Returns the resulting BIGNUM, or NULL on error.  BN_R_INVALID_LENGTH is
+ * raised when n < 4 and BN_R_ENCODING_ERROR when the length prefix does not
+ * match 'n'.  A BIGNUM allocated here is released again if the conversion
+ * fails; a caller-supplied 'a' is never freed.
+ */
+BIGNUM *BN_mpi2bn(const unsigned char *d, int n, BIGNUM *a)
+{
+    long len;
+    int neg = 0;
+    int allocated = 0;          /* set when 'a' was created by this call */
+
+    if (n < 4) {
+        BNerr(BN_F_BN_MPI2BN, BN_R_INVALID_LENGTH);
+        return (NULL);
+    }
+    len = ((long)d[0] << 24) | ((long)d[1] << 16) | ((int)d[2] << 8) | (int)
+        d[3];
+    if ((len + 4) != n) {
+        BNerr(BN_F_BN_MPI2BN, BN_R_ENCODING_ERROR);
+        return (NULL);
+    }
+
+    if (a == NULL) {
+        a = BN_new();
+        if (a == NULL)
+            return (NULL);
+        allocated = 1;
+    }
+
+    /* an empty payload encodes zero */
+    if (len == 0) {
+        a->neg = 0;
+        a->top = 0;
+        return (a);
+    }
+    d += 4;
+    /* the top bit of the first payload byte is the sign */
+    if ((*d) & 0x80)
+        neg = 1;
+    if (BN_bin2bn(d, (int)len, a) == NULL) {
+        /* do not leak the BIGNUM we created ourselves */
+        if (allocated)
+            BN_free(a);
+        return (NULL);
+    }
+    a->neg = neg;
+    if (neg) {
+        /* the sign bit was read in as part of the magnitude: remove it */
+        BN_clear_bit(a, BN_num_bits(a) - 1);
+    }
+    bn_check_top(a);
+    return (a);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_mpi.o
+++ b/openssl-1.0.2f/crypto/bn/bn_mpi.o
--- a/openssl-1.0.2f/crypto/bn/bn_mul.c
+++ b/openssl-1.0.2f/crypto/bn/bn_mul.c
--- a/openssl-1.0.2f/crypto/bn/bn_mul.o
+++ b/openssl-1.0.2f/crypto/bn/bn_mul.o
--- a/openssl-1.0.2f/crypto/bn/bn_nist.c
+++ b/openssl-1.0.2f/crypto/bn/bn_nist.c
--- a/openssl-1.0.2f/crypto/bn/bn_nist.o
+++ b/openssl-1.0.2f/crypto/bn/bn_nist.o
--- a/openssl-1.0.2f/crypto/bn/bn_prime.c
+++ b/openssl-1.0.2f/crypto/bn/bn_prime.c
@@ -0,0 +1,515 @@
+/* crypto/bn/bn_prime.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+#include <openssl/rand.h>
+
+/*
+ * NB: these functions have been "upgraded", the deprecated versions (which
+ * are compatibility wrappers using these functions) are in bn_depr.c. -
+ * Geoff
+ */
+
+/*
+ * The quick sieve algorithm approach to weeding out non-primes is Philip
+ * Zimmermann's, as implemented in PGP.  I have had a read of his comments
+ * and implemented my own version.
+ */
+#include "bn_prime.h"
+
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+                   const BIGNUM *a1_odd, int k, BN_CTX *ctx,
+                   BN_MONT_CTX *mont);
+static int probable_prime(BIGNUM *rnd, int bits);
+static int probable_prime_dh(BIGNUM *rnd, int bits,
+                             const BIGNUM *add, const BIGNUM *rem,
+                             BN_CTX *ctx);
+static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
+                                  const BIGNUM *rem, BN_CTX *ctx);
+
+/*
+ * Invoke the progress callback described by 'cb' with the values 'a' and
+ * 'b'.
+ *
+ * Returns non-zero to continue and 0 to abort.  A NULL 'cb' and a version 1
+ * callback always yield 1; a version 2 callback decides through its own
+ * return value; any other version yields 0.
+ */
+int BN_GENCB_call(BN_GENCB *cb, int a, int b)
+{
+    /* no callback object: nothing to ask, carry on */
+    if (cb == NULL)
+        return 1;
+
+    if (cb->ver == 1) {
+        /* deprecated style: call it if set, its outcome is not consulted */
+        if (cb->cb.cb_1 != NULL)
+            cb->cb.cb_1(a, b, cb->arg);
+        return 1;
+    }
+
+    if (cb->ver == 2) {
+        /* new style: the callback's result is passed straight through */
+        return cb->cb.cb_2(a, b, cb);
+    }
+
+    /* unrecognised callback version */
+    return 0;
+}
+
+/*
+ * Generate a probable prime into 'ret'.
+ *
+ * ret    receives the generated number
+ * bits   size handed to the candidate generators; also selects the number
+ *        of primality-test rounds via BN_prime_checks_for_size()
+ * safe   when non-zero, (ret - 1) / 2 must pass the primality test as well
+ * add    when NULL, candidates come from probable_prime(); otherwise from
+ *        probable_prime_dh_safe() (safe != 0) or probable_prime_dh()
+ * rem    passed through to the probable_prime_dh*() generators
+ * cb     optional progress callback: called with (0, n) for the n-th
+ *        candidate and, in safe mode, with (2, n - 1) after each passed
+ *        pair of test rounds; a zero return from it aborts generation
+ *
+ * Returns 1 when a number passing all tests was found, 0 on error or when
+ * the callback aborted.
+ */
+int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe,
+                         const BIGNUM *add, const BIGNUM *rem, BN_GENCB *cb)
+{
+    BIGNUM *t;
+    int found = 0;
+    int i, j, c1 = 0;
+    BN_CTX *ctx;
+    int checks = BN_prime_checks_for_size(bits);
+
+    ctx = BN_CTX_new();
+    if (ctx == NULL)
+        goto err;
+    BN_CTX_start(ctx);
+    t = BN_CTX_get(ctx);
+    if (!t)
+        goto err;
+    /* every rejected candidate jumps back here for a fresh one */
+ loop:
+    /* make a random number and set the top and bottom bits */
+    if (add == NULL) {
+        if (!probable_prime(ret, bits))
+            goto err;
+    } else {
+        if (safe) {
+            if (!probable_prime_dh_safe(ret, bits, add, rem, ctx))
+                goto err;
+        } else {
+            if (!probable_prime_dh(ret, bits, add, rem, ctx))
+                goto err;
+        }
+    }
+    /* if (BN_mod_word(ret,(BN_ULONG)3) == 1) goto loop; */
+    if (!BN_GENCB_call(cb, 0, c1++))
+        /* aborted */
+        goto err;
+
+    if (!safe) {
+        /* -1: error, 0: composite (try another candidate), else accepted */
+        i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
+        if (i == -1)
+            goto err;
+        if (i == 0)
+            goto loop;
+    } else {
+        /*
+         * for "safe prime" generation, check that (p-1)/2 is prime. Since a
+         * prime is odd, We just need to divide by 2
+         */
+        if (!BN_rshift1(t, ret))
+            goto err;
+
+        /*
+         * Alternate single test rounds on ret and on t, so that a failure of
+         * either one is noticed before all rounds are spent on the other.
+         */
+        for (i = 0; i < checks; i++) {
+            j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, cb);
+            if (j == -1)
+                goto err;
+            if (j == 0)
+                goto loop;
+
+            j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, cb);
+            if (j == -1)
+                goto err;
+            if (j == 0)
+                goto loop;
+
+            if (!BN_GENCB_call(cb, 2, c1 - 1))
+                goto err;
+            /* We have a safe prime test pass */
+        }
+    }
+    /* we have a prime :-) */
+    found = 1;
+ err:
+    /* ctx is NULL only if BN_CTX_new() failed, before BN_CTX_start() ran */
+    if (ctx != NULL) {
+        BN_CTX_end(ctx);
+        BN_CTX_free(ctx);
+    }
+    bn_check_top(ret);
+    return found;
+}
+
+/*
+ * Primality test for 'a' without the trial-division step: forwards to
+ * BN_is_prime_fasttest_ex() with do_trial_division set to 0 and returns its
+ * result unchanged.
+ */
+int BN_is_prime_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
+                   BN_GENCB *cb)
+{
+    return BN_is_prime_fasttest_ex(a, checks, ctx_passed, 0, cb);
+}
+
+/*
+ * Probabilistic primality test for 'a'.
+ *
+ * a                  number to test
+ * checks             number of witness rounds; BN_prime_checks selects a
+ *                    count from the bit length of 'a'
+ * ctx_passed         BN_CTX to work in, or NULL to have a temporary one
+ *                    created and freed here
+ * do_trial_division  when non-zero, first try the entries of the primes[]
+ *                    table as divisors
+ * cb                 optional progress callback: called with (1, -1) after
+ *                    trial division and with (1, i) after round i; a zero
+ *                    return from it aborts the test
+ *
+ * Returns 1 if no round found 'a' to be composite, 0 if 'a' is not prime,
+ * and -1 on error or when the callback aborted.
+ */
+int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
+                            int do_trial_division, BN_GENCB *cb)
+{
+    int i, j, ret = -1;
+    int k;
+    BN_CTX *ctx = NULL;
+    BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
+    BN_MONT_CTX *mont = NULL;
+    const BIGNUM *A = NULL;
+
+    if (BN_cmp(a, BN_value_one()) <= 0)
+        return 0;
+
+    if (checks == BN_prime_checks)
+        checks = BN_prime_checks_for_size(BN_num_bits(a));
+
+    /* first look for small factors */
+    if (!BN_is_odd(a))
+        /* a is even => a is prime if and only if a == 2 */
+        return BN_is_word(a, 2);
+    if (do_trial_division) {
+        for (i = 1; i < NUMPRIMES; i++) {
+            if (BN_mod_word(a, primes[i]) == 0)
+                /*
+                 * Divisible by a table prime: 'a' is prime only if it is
+                 * that table entry itself, composite otherwise.
+                 */
+                return BN_is_word(a, primes[i]);
+        }
+        if (!BN_GENCB_call(cb, 1, -1))
+            goto err;
+    }
+
+    if (ctx_passed != NULL)
+        ctx = ctx_passed;
+    else if ((ctx = BN_CTX_new()) == NULL)
+        goto err;
+    BN_CTX_start(ctx);
+
+    /* A := abs(a) */
+    if (a->neg) {
+        BIGNUM *t;
+        if ((t = BN_CTX_get(ctx)) == NULL)
+            goto err;
+        /* a failed copy must not be tested as if it were 'a' */
+        if (BN_copy(t, a) == NULL)
+            goto err;
+        t->neg = 0;
+        A = t;
+    } else
+        A = a;
+    /* only the last BN_CTX_get() result is checked for failure */
+    A1 = BN_CTX_get(ctx);
+    A1_odd = BN_CTX_get(ctx);
+    check = BN_CTX_get(ctx);
+    if (check == NULL)
+        goto err;
+
+    /* compute A1 := A - 1 */
+    if (!BN_copy(A1, A))
+        goto err;
+    if (!BN_sub_word(A1, 1))
+        goto err;
+    if (BN_is_zero(A1)) {
+        ret = 0;
+        goto err;
+    }
+
+    /* write  A1  as  A1_odd * 2^k */
+    k = 1;
+    while (!BN_is_bit_set(A1, k))
+        k++;
+    if (!BN_rshift(A1_odd, A1, k))
+        goto err;
+
+    /* Montgomery setup for computations mod A */
+    mont = BN_MONT_CTX_new();
+    if (mont == NULL)
+        goto err;
+    if (!BN_MONT_CTX_set(mont, A, ctx))
+        goto err;
+
+    for (i = 0; i < checks; i++) {
+        if (!BN_pseudo_rand_range(check, A1))
+            goto err;
+        if (!BN_add_word(check, 1))
+            goto err;
+        /* now 1 <= check < A */
+
+        j = witness(check, A, A1, A1_odd, k, ctx, mont);
+        if (j == -1)
+            goto err;
+        if (j) {
+            /* 'check' proves that A is composite */
+            ret = 0;
+            goto err;
+        }
+        if (!BN_GENCB_call(cb, 1, i))
+            goto err;
+    }
+    ret = 1;
+ err:
+    if (ctx != NULL) {
+        BN_CTX_end(ctx);
+        /* free the context only if it was created here */
+        if (ctx_passed == NULL)
+            BN_CTX_free(ctx);
+    }
+    if (mont != NULL)
+        BN_MONT_CTX_free(mont);
+
+    return (ret);
+}
+
+/*
+ * One witness round for the primality test of 'a'.
+ *
+ * w       candidate witness; overwritten with intermediate powers
+ * a       number under test
+ * a1      a - 1
+ * a1_odd  odd part of a1, with a1 == a1_odd * 2^k
+ * mont    Montgomery context for arithmetic mod 'a'
+ *
+ * Returns 0 if 'a' is probably prime with respect to 'w', 1 if 'w' shows
+ * that 'a' is composite, and -1 if a BIGNUM operation failed.
+ */
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+                   const BIGNUM *a1_odd, int k, BN_CTX *ctx,
+                   BN_MONT_CTX *mont)
+{
+    /* w := w^a1_odd mod a */
+    if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont))
+        return -1;
+
+    /* w == 1 or w == -1 (mod a) right away: 'a' is probably prime */
+    if (BN_is_one(w) || BN_cmp(w, a1) == 0)
+        return 0;
+
+    /* square up to k - 1 times, watching for +1 and -1 */
+    for (--k; k != 0; --k) {
+        /* w := w^2 mod a */
+        if (!BN_mod_mul(w, w, w, a, ctx))
+            return -1;
+        /*
+         * Reaching 1 here means 'a' is composite, otherwise a previous 'w'
+         * would have been == -1 (mod 'a').
+         */
+        if (BN_is_one(w))
+            return 1;
+        /* w == -1 (mod a), 'a' is probably prime */
+        if (BN_cmp(w, a1) == 0)
+            return 0;
+    }
+
+    /*
+     * At this point 'w' is the (a-1)/2-th power of the original 'w' and is
+     * neither -1 nor +1, so 'a' cannot be prime.
+     */
+    bn_check_top(w);
+    return 1;
+}
+
+/*
+ * Produce a prime candidate in 'rnd': a random number from BN_rand() plus
+ * the smallest even offset 'delta' for which the sum is, for every entry of
+ * primes[] other than 2, neither divisible by that prime nor congruent to 1
+ * modulo it.
+ *
+ * Returns 1 on success, 0 if BN_rand() or BN_add_word() fails.
+ */
+static int probable_prime(BIGNUM *rnd, int bits)
+{
+    int i;
+    prime_t mods[NUMPRIMES];
+    BN_ULONG delta, maxdelta;
+
+ again:
+    if (!BN_rand(rnd, bits, 1, 1))
+        return (0);
+    /* we now have a random number 'rnd' to test. */
+    /* cache rnd mod primes[i] so the sieve below needs no BIGNUM division */
+    for (i = 1; i < NUMPRIMES; i++)
+        mods[i] = (prime_t) BN_mod_word(rnd, (BN_ULONG)primes[i]);
+    /* upper bound for delta; presumably keeps mods[i] + delta from wrapping */
+    maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
+    delta = 0;
+    /* after each bump of delta the sieve restarts from the first odd prime */
+ loop:for (i = 1; i < NUMPRIMES; i++) {
+        /*
+         * Reject rnd + delta if it is divisible by primes[i] (remainder 0)
+         * or congruent to 1 modulo primes[i] (remainder 1), i.e. if
+         * primes[i] divides rnd + delta - 1.  primes[0] == 2 is skipped.
+         */
+        if (((mods[i] + delta) % primes[i]) <= 1) {
+            delta += 2;
+            /* offset range exhausted: start over with a new random number */
+            if (delta > maxdelta)
+                goto again;
+            goto loop;
+        }
+    }
+    /*
+     * NOTE(review): nothing here re-checks the bit length after delta is
+     * added, so the result could be longer than 'bits' bits -- confirm
+     * whether callers rely on the exact size.
+     */
+    if (!BN_add_word(rnd, delta))
+        return (0);
+    bn_check_top(rnd);
+    return (1);
+}
+
+/*
+ * Produce a prime candidate in 'rnd' that is congruent to 'rem' modulo
+ * 'add' (to 1 modulo 'add' when 'rem' is NULL).  The candidate is stepped
+ * by 'add' until, for every entry of primes[] other than 2, its remainder
+ * modulo that prime is greater than 1.
+ *
+ * Returns 1 on success, 0 if a BIGNUM operation fails.
+ */
+static int probable_prime_dh(BIGNUM *rnd, int bits,
+                             const BIGNUM *add, const BIGNUM *rem,
+                             BN_CTX *ctx)
+{
+    int idx, ok = 0;
+    BIGNUM *residue;
+
+    BN_CTX_start(ctx);
+    residue = BN_CTX_get(ctx);
+    if (residue == NULL)
+        goto done;
+
+    if (!BN_rand(rnd, bits, 0, 1))
+        goto done;
+
+    /* we need ((rnd-rem) % add) == 0: round down to a multiple of add ... */
+    if (!BN_mod(residue, rnd, add, ctx) || !BN_sub(rnd, rnd, residue))
+        goto done;
+    /* ... then add the wanted remainder (1 when none was given) */
+    if (rem != NULL) {
+        if (!BN_add(rnd, rnd, rem))
+            goto done;
+    } else if (!BN_add_word(rnd, 1)) {
+        goto done;
+    }
+
+    /* we now have a random number 'rnd' to test. */
+    idx = 1;
+    while (idx < NUMPRIMES) {
+        if (BN_mod_word(rnd, (BN_ULONG)primes[idx]) <= 1) {
+            /* unsuitable: step to the next candidate and sieve it afresh */
+            if (!BN_add(rnd, rnd, add))
+                goto done;
+            idx = 1;
+        } else {
+            idx++;
+        }
+    }
+    ok = 1;
+ done:
+    BN_CTX_end(ctx);
+    bn_check_top(rnd);
+    return ok;
+}
+
+/*
+ * Produce a safe-prime candidate in 'p'.  A value q is built from a random
+ * number of bits - 1 bits, adjusted to a fixed remainder modulo padd / 2
+ * (rem / 2, or 1 when 'rem' is NULL), and p is set to 2q + 1.  The pair is
+ * then stepped by (padd, padd / 2) until neither p nor q is divisible by any
+ * entry of primes[] other than 2.
+ *
+ * Returns 1 on success, 0 if a BIGNUM operation fails.
+ */
+static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
+                                  const BIGNUM *rem, BN_CTX *ctx)
+{
+    int idx, ok = 0;
+    BIGNUM *scratch, *q, *qadd;
+
+    BN_CTX_start(ctx);
+    scratch = BN_CTX_get(ctx);
+    q = BN_CTX_get(ctx);
+    qadd = BN_CTX_get(ctx);
+    /* only the last BN_CTX_get() result is checked for failure */
+    if (qadd == NULL)
+        goto done;
+
+    /* qadd := padd / 2 */
+    if (!BN_rshift1(qadd, padd))
+        goto done;
+
+    /* q gets one bit less than requested for p */
+    if (!BN_rand(q, bits - 1, 0, 1))
+        goto done;
+
+    /* we need ((q-rem/2) % qadd) == 0: round down to a multiple of qadd ... */
+    if (!BN_mod(scratch, q, qadd, ctx) || !BN_sub(q, q, scratch))
+        goto done;
+    /* ... then add the wanted remainder (1 when none was given) */
+    if (rem != NULL) {
+        if (!BN_rshift1(scratch, rem) || !BN_add(q, q, scratch))
+            goto done;
+    } else if (!BN_add_word(q, 1)) {
+        goto done;
+    }
+
+    /* p := 2q + 1 */
+    if (!BN_lshift1(p, q) || !BN_add_word(p, 1))
+        goto done;
+
+    idx = 1;
+    while (idx < NUMPRIMES) {
+        BN_ULONG small = (BN_ULONG)primes[idx];
+
+        if (BN_mod_word(p, small) == 0 || BN_mod_word(q, small) == 0) {
+            /* p or q has a small factor: step both and sieve afresh */
+            if (!BN_add(p, p, padd))
+                goto done;
+            if (!BN_add(q, q, qadd))
+                goto done;
+            idx = 1;
+        } else {
+            idx++;
+        }
+    }
+    ok = 1;
+ done:
+    BN_CTX_end(ctx);
+    bn_check_top(p);
+    return ok;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_prime.h
+++ b/openssl-1.0.2f/crypto/bn/bn_prime.h
@@ -0,0 +1,326 @@
+/* Auto generated by bn_prime.pl */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+/*
+ * Size and element type of the primes[] table.  Without EIGHT_BIT the table
+ * has 2048 entries stored as unsigned short; with EIGHT_BIT it is cut down
+ * to its first 54 entries, stored as unsigned char.
+ */
+#ifndef EIGHT_BIT
+# define NUMPRIMES 2048
+typedef unsigned short prime_t;
+#else
+# define NUMPRIMES 54
+typedef unsigned char prime_t;
+#endif
+static const prime_t primes[NUMPRIMES] = {
+    2, 3, 5, 7, 11, 13, 17, 19,
+    23, 29, 31, 37, 41, 43, 47, 53,
+    59, 61, 67, 71, 73, 79, 83, 89,
+    97, 101, 103, 107, 109, 113, 127, 131,
+    137, 139, 149, 151, 157, 163, 167, 173,
+    179, 181, 191, 193, 197, 199, 211, 223,
+    227, 229, 233, 239, 241, 251,
+#ifndef EIGHT_BIT
+    257, 263,
+    269, 271, 277, 281, 283, 293, 307, 311,
+    313, 317, 331, 337, 347, 349, 353, 359,
+    367, 373, 379, 383, 389, 397, 401, 409,
+    419, 421, 431, 433, 439, 443, 449, 457,
+    461, 463, 467, 479, 487, 491, 499, 503,
+    509, 521, 523, 541, 547, 557, 563, 569,
+    571, 577, 587, 593, 599, 601, 607, 613,
+    617, 619, 631, 641, 643, 647, 653, 659,
+    661, 673, 677, 683, 691, 701, 709, 719,
+    727, 733, 739, 743, 751, 757, 761, 769,
+    773, 787, 797, 809, 811, 821, 823, 827,
+    829, 839, 853, 857, 859, 863, 877, 881,
+    883, 887, 907, 911, 919, 929, 937, 941,
+    947, 953, 967, 971, 977, 983, 991, 997,
+    1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
+    1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
+    1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
+    1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
+    1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
+    1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+    1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,
+    1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
+    1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
+    1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
+    1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,
+    1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
+    1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747,
+    1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
+    1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
+    1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949,
+    1951, 1973, 1979, 1987, 1993, 1997, 1999, 2003,
+    2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
+    2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
+    2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203,
+    2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
+    2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311,
+    2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377,
+    2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
+    2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503,
+    2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579,
+    2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
+    2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693,
+    2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
+    2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
+    2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861,
+    2879, 2887, 2897, 2903, 2909, 2917, 2927, 2939,
+    2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
+    3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
+    3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167,
+    3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
+    3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301,
+    3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347,
+    3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
+    3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491,
+    3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541,
+    3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
+    3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671,
+    3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
+    3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
+    3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863,
+    3877, 3881, 3889, 3907, 3911, 3917, 3919, 3923,
+    3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
+    4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
+    4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129,
+    4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
+    4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259,
+    4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337,
+    4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
+    4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481,
+    4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547,
+    4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
+    4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673,
+    4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
+    4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
+    4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909,
+    4919, 4931, 4933, 4937, 4943, 4951, 4957, 4967,
+    4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
+    5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
+    5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167,
+    5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
+    5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309,
+    5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399,
+    5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
+    5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507,
+    5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573,
+    5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
+    5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711,
+    5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
+    5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
+    5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897,
+    5903, 5923, 5927, 5939, 5953, 5981, 5987, 6007,
+    6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
+    6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
+    6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211,
+    6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
+    6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329,
+    6337, 6343, 6353, 6359, 6361, 6367, 6373, 6379,
+    6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
+    6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563,
+    6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637,
+    6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
+    6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779,
+    6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
+    6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
+    6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971,
+    6977, 6983, 6991, 6997, 7001, 7013, 7019, 7027,
+    7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
+    7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
+    7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253,
+    7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
+    7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457,
+    7459, 7477, 7481, 7487, 7489, 7499, 7507, 7517,
+    7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
+    7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621,
+    7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691,
+    7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
+    7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853,
+    7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
+    7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
+    8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087,
+    8089, 8093, 8101, 8111, 8117, 8123, 8147, 8161,
+    8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231,
+    8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
+    8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369,
+    8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443,
+    8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537,
+    8539, 8543, 8563, 8573, 8581, 8597, 8599, 8609,
+    8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
+    8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731,
+    8737, 8741, 8747, 8753, 8761, 8779, 8783, 8803,
+    8807, 8819, 8821, 8831, 8837, 8839, 8849, 8861,
+    8863, 8867, 8887, 8893, 8923, 8929, 8933, 8941,
+    8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
+    9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091,
+    9103, 9109, 9127, 9133, 9137, 9151, 9157, 9161,
+    9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227,
+    9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311,
+    9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
+    9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433,
+    9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491,
+    9497, 9511, 9521, 9533, 9539, 9547, 9551, 9587,
+    9601, 9613, 9619, 9623, 9629, 9631, 9643, 9649,
+    9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
+    9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791,
+    9803, 9811, 9817, 9829, 9833, 9839, 9851, 9857,
+    9859, 9871, 9883, 9887, 9901, 9907, 9923, 9929,
+    9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037,
+    10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099,
+    10103, 10111, 10133, 10139, 10141, 10151, 10159, 10163,
+    10169, 10177, 10181, 10193, 10211, 10223, 10243, 10247,
+    10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303,
+    10313, 10321, 10331, 10333, 10337, 10343, 10357, 10369,
+    10391, 10399, 10427, 10429, 10433, 10453, 10457, 10459,
+    10463, 10477, 10487, 10499, 10501, 10513, 10529, 10531,
+    10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627,
+    10631, 10639, 10651, 10657, 10663, 10667, 10687, 10691,
+    10709, 10711, 10723, 10729, 10733, 10739, 10753, 10771,
+    10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
+    10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937,
+    10939, 10949, 10957, 10973, 10979, 10987, 10993, 11003,
+    11027, 11047, 11057, 11059, 11069, 11071, 11083, 11087,
+    11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161,
+    11171, 11173, 11177, 11197, 11213, 11239, 11243, 11251,
+    11257, 11261, 11273, 11279, 11287, 11299, 11311, 11317,
+    11321, 11329, 11351, 11353, 11369, 11383, 11393, 11399,
+    11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483,
+    11489, 11491, 11497, 11503, 11519, 11527, 11549, 11551,
+    11579, 11587, 11593, 11597, 11617, 11621, 11633, 11657,
+    11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
+    11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813,
+    11821, 11827, 11831, 11833, 11839, 11863, 11867, 11887,
+    11897, 11903, 11909, 11923, 11927, 11933, 11939, 11941,
+    11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011,
+    12037, 12041, 12043, 12049, 12071, 12073, 12097, 12101,
+    12107, 12109, 12113, 12119, 12143, 12149, 12157, 12161,
+    12163, 12197, 12203, 12211, 12227, 12239, 12241, 12251,
+    12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323,
+    12329, 12343, 12347, 12373, 12377, 12379, 12391, 12401,
+    12409, 12413, 12421, 12433, 12437, 12451, 12457, 12473,
+    12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
+    12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589,
+    12601, 12611, 12613, 12619, 12637, 12641, 12647, 12653,
+    12659, 12671, 12689, 12697, 12703, 12713, 12721, 12739,
+    12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821,
+    12823, 12829, 12841, 12853, 12889, 12893, 12899, 12907,
+    12911, 12917, 12919, 12923, 12941, 12953, 12959, 12967,
+    12973, 12979, 12983, 13001, 13003, 13007, 13009, 13033,
+    13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109,
+    13121, 13127, 13147, 13151, 13159, 13163, 13171, 13177,
+    13183, 13187, 13217, 13219, 13229, 13241, 13249, 13259,
+    13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
+    13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421,
+    13441, 13451, 13457, 13463, 13469, 13477, 13487, 13499,
+    13513, 13523, 13537, 13553, 13567, 13577, 13591, 13597,
+    13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681,
+    13687, 13691, 13693, 13697, 13709, 13711, 13721, 13723,
+    13729, 13751, 13757, 13759, 13763, 13781, 13789, 13799,
+    13807, 13829, 13831, 13841, 13859, 13873, 13877, 13879,
+    13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933,
+    13963, 13967, 13997, 13999, 14009, 14011, 14029, 14033,
+    14051, 14057, 14071, 14081, 14083, 14087, 14107, 14143,
+    14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
+    14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323,
+    14327, 14341, 14347, 14369, 14387, 14389, 14401, 14407,
+    14411, 14419, 14423, 14431, 14437, 14447, 14449, 14461,
+    14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549,
+    14551, 14557, 14561, 14563, 14591, 14593, 14621, 14627,
+    14629, 14633, 14639, 14653, 14657, 14669, 14683, 14699,
+    14713, 14717, 14723, 14731, 14737, 14741, 14747, 14753,
+    14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821,
+    14827, 14831, 14843, 14851, 14867, 14869, 14879, 14887,
+    14891, 14897, 14923, 14929, 14939, 14947, 14951, 14957,
+    14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
+    15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137,
+    15139, 15149, 15161, 15173, 15187, 15193, 15199, 15217,
+    15227, 15233, 15241, 15259, 15263, 15269, 15271, 15277,
+    15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331,
+    15349, 15359, 15361, 15373, 15377, 15383, 15391, 15401,
+    15413, 15427, 15439, 15443, 15451, 15461, 15467, 15473,
+    15493, 15497, 15511, 15527, 15541, 15551, 15559, 15569,
+    15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643,
+    15647, 15649, 15661, 15667, 15671, 15679, 15683, 15727,
+    15731, 15733, 15737, 15739, 15749, 15761, 15767, 15773,
+    15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
+    15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919,
+    15923, 15937, 15959, 15971, 15973, 15991, 16001, 16007,
+    16033, 16057, 16061, 16063, 16067, 16069, 16073, 16087,
+    16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183,
+    16187, 16189, 16193, 16217, 16223, 16229, 16231, 16249,
+    16253, 16267, 16273, 16301, 16319, 16333, 16339, 16349,
+    16361, 16363, 16369, 16381, 16411, 16417, 16421, 16427,
+    16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493,
+    16519, 16529, 16547, 16553, 16561, 16567, 16573, 16603,
+    16607, 16619, 16631, 16633, 16649, 16651, 16657, 16661,
+    16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
+    16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843,
+    16871, 16879, 16883, 16889, 16901, 16903, 16921, 16927,
+    16931, 16937, 16943, 16963, 16979, 16981, 16987, 16993,
+    17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053,
+    17077, 17093, 17099, 17107, 17117, 17123, 17137, 17159,
+    17167, 17183, 17189, 17191, 17203, 17207, 17209, 17231,
+    17239, 17257, 17291, 17293, 17299, 17317, 17321, 17327,
+    17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389,
+    17393, 17401, 17417, 17419, 17431, 17443, 17449, 17467,
+    17471, 17477, 17483, 17489, 17491, 17497, 17509, 17519,
+    17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
+    17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683,
+    17707, 17713, 17729, 17737, 17747, 17749, 17761, 17783,
+    17789, 17791, 17807, 17827, 17837, 17839, 17851, 17863,
+#endif
+};
--- a/openssl-1.0.2f/crypto/bn/bn_prime.o
+++ b/openssl-1.0.2f/crypto/bn/bn_prime.o
--- a/openssl-1.0.2f/crypto/bn/bn_prime.pl
+++ b/openssl-1.0.2f/crypto/bn/bn_prime.pl
@@ -0,0 +1,119 @@
+#!/usr/local/bin/perl
+# bn_prime.pl
+
+# Number of primes to generate: 2048 unless the first command-line
+# argument overrides it.
+$num = ($#ARGV >= 0) ? $ARGV[0] : 2048;
+
+# Build @primes by trial division.  2 is seeded by hand; after that only odd
+# candidates are tried, each tested against the primes found so far that do
+# not exceed its integer square root.
+@primes = (2);
+$p = 1;
+candidate: while (scalar(@primes) < $num)
+	{
+	$p += 2;
+	$limit = int(sqrt($p));
+
+	foreach $q (@primes)
+		{
+		last if ($q > $limit);
+		next candidate if ($p % $q == 0);
+		}
+	push(@primes, $p);
+	}
+
+# print <<"EOF";
+# /* Auto generated by bn_prime.pl */
+# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
+#  * All rights reserved.
+#  * Copyright remains Eric Young's, and as such any Copyright notices in
+#  * the code are not to be removed.
+#  * See the COPYRIGHT file in the SSLeay distribution for more details.
+#  */
+# 
+# EOF
+
+# Emit the licence banner that heads the generated output.  The <<\EOF form
+# is a non-interpolating here-document, so the text below (including the '@'
+# characters in the e-mail addresses) is printed verbatim.
+print <<\EOF;
+/* Auto generated by bn_prime.pl */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+EOF
+
+# Emit the C table.  Two layouts are produced from one initialiser list:
+#   - default build:   prime_t is unsigned short, all primes are present;
+#   - EIGHT_BIT build: prime_t is unsigned char, only the leading primes
+#     that are <= 256 are present (the rest sit inside #ifndef EIGHT_BIT).
+#
+# $total is the number of primes actually generated; it is used for
+# NUMPRIMES so the declared array size always matches the initialiser
+# count, even when the requested $num was not a positive integer.
+$total=scalar(@primes);
+
+# $eight is the index of the first prime that no longer fits in eight bits,
+# i.e. the number of primes available to EIGHT_BIT builds.  If every
+# generated prime fits (small $num), the whole table is available.
+$eight=$total;
+for ($i=0; $i < $total; $i++)
+	{
+	if ($primes[$i] > 256)
+		{
+		$eight=$i;
+		last;
+		}
+	}
+
+printf "#ifndef EIGHT_BIT\n";
+printf "#define NUMPRIMES %d\n",$total;
+printf "typedef unsigned short prime_t;\n";
+printf "#else\n";
+printf "#define NUMPRIMES %d\n",$eight;
+printf "typedef unsigned char prime_t;\n";
+printf "#endif\n";
+print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
+# $init becomes non-zero once the "#ifndef EIGHT_BIT" guard has been opened
+# in front of the first prime larger than 256.
+$init=0;
+for ($i=0; $i < $total; $i++)
+	{
+	printf "\n#ifndef EIGHT_BIT\n\t" if ($primes[$i] > 256) && !($init++);
+	printf("\n\t") if (($i%8) == 0) && ($i != 0);
+	printf("%4d,",$primes[$i]);
+	}
+# Only close the guard if it was opened; otherwise the generated header
+# would contain an unmatched #endif.
+print "\n#endif" if ($init);
+print "\n\t};\n";
+
+
--- a/openssl-1.0.2f/crypto/bn/bn_print.c
+++ b/openssl-1.0.2f/crypto/bn/bn_print.c
@@ -0,0 +1,388 @@
+/* crypto/bn/bn_print.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <limits.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+#include "bn_lcl.h"
+
+static const char Hex[] = "0123456789ABCDEF";
+
+/*
+ * Render a as an upper-case hexadecimal string, two digits per byte, with a
+ * leading '-' when a->neg is set and "0" for a zero value.  Returns NULL if
+ * the buffer cannot be allocated.  The caller must 'OPENSSL_free' the
+ * returned data.
+ */
+char *BN_bn2hex(const BIGNUM *a)
+{
+    int word, shift, byte, started = 0;
+    char *out;
+    char *cur;
+
+    /*
+     * "-0" needs 3 bytes including the NUL terminator; otherwise allow two
+     * hex digits per byte plus room for a sign and the terminator.
+     */
+    out = OPENSSL_malloc((a->neg && BN_is_zero(a))
+                         ? 3 : a->top * BN_BYTES * 2 + 2);
+    if (out == NULL) {
+        BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);
+        return NULL;
+    }
+
+    cur = out;
+    if (a->neg)
+        *cur++ = '-';
+    if (BN_is_zero(a))
+        *cur++ = '0';
+
+    /* Walk the words from most to least significant, one byte at a time. */
+    for (word = a->top - 1; word >= 0; word--) {
+        for (shift = BN_BITS2 - 8; shift >= 0; shift -= 8) {
+            byte = ((int)(a->d[word] >> (long)shift)) & 0xff;
+            /* skip zero bytes until the first non-zero one is seen */
+            if (!started && byte == 0)
+                continue;
+            *cur++ = Hex[byte >> 4];
+            *cur++ = Hex[byte & 0x0f];
+            started = 1;
+        }
+    }
+    *cur = '\0';
+    return out;
+}
+
+/*
+ * Convert a to a NUL-terminated decimal string, prefixed with '-' for
+ * negative values.  Returns NULL on allocation failure or if the conversion
+ * fails.  The caller must 'OPENSSL_free' the returned data.
+ */
+char *BN_bn2dec(const BIGNUM *a)
+{
+    int i, num, bn_data_num, ok = 0;
+    char *buf = NULL;
+    char *p;
+    BIGNUM *t = NULL;
+    BN_ULONG *bn_data = NULL, *lp;
+
+    /*-
+     * get an upper bound for the length of the decimal integer
+     * num <= (BN_num_bits(a) + 1) * log(2)
+     *     <= 3 * BN_num_bits(a) * 0.1001 + log(2) + 1     (rounding error)
+     *     <= 3 * BN_num_bits(a)/10 + 3 * BN_num_bits(a)/1000 + 1 + 1
+     */
+    i = BN_num_bits(a) * 3;
+    num = (i / 10 + i / 1000 + 1) + 1;
+    /* number of BN_DEC_NUM-digit blocks that bn_data can hold */
+    bn_data_num = num / BN_DEC_NUM + 1;
+    bn_data =
+        (BN_ULONG *)OPENSSL_malloc(bn_data_num * sizeof(BN_ULONG));
+    buf = (char *)OPENSSL_malloc(num + 3);
+    if ((buf == NULL) || (bn_data == NULL)) {
+        BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);
+        goto err;
+    }
+    /* work on a copy: BN_div_word() below destroys its operand */
+    if ((t = BN_dup(a)) == NULL)
+        goto err;
+
+#define BUF_REMAIN (num+3 - (size_t)(p - buf))
+    p = buf;
+    lp = bn_data;
+    if (BN_is_zero(t)) {
+        *(p++) = '0';
+        *(p++) = '\0';
+    } else {
+        if (BN_is_negative(t))
+            *p++ = '-';
+
+        while (!BN_is_zero(t)) {
+            /* never write past the end of bn_data */
+            if (lp - bn_data >= bn_data_num)
+                goto err;
+            *lp = BN_div_word(t, BN_DEC_CONV);
+            /* BN_div_word() reports failure as (BN_ULONG)-1 */
+            if (*lp == (BN_ULONG)-1)
+                goto err;
+            lp++;
+        }
+        lp--;
+        /*
+         * We now have a series of blocks, BN_DEC_NUM chars in length, where
+         * the last one needs truncation. The blocks need to be reversed in
+         * order.
+         */
+        BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT1, *lp);
+        while (*p)
+            p++;
+        while (lp != bn_data) {
+            lp--;
+            BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT2, *lp);
+            while (*p)
+                p++;
+        }
+    }
+    ok = 1;
+ err:
+    if (bn_data != NULL)
+        OPENSSL_free(bn_data);
+    if (t != NULL)
+        BN_free(t);
+    /* on failure the partially filled buffer is not handed to the caller */
+    if (!ok && buf) {
+        OPENSSL_free(buf);
+        buf = NULL;
+    }
+
+    return (buf);
+}
+
+/*
+ * Parse an optionally '-'-prefixed run of hexadecimal digits from a.
+ *
+ * Returns the number of characters consumed (digits plus sign), or 0 on
+ * error.  If bn is NULL only that length is computed.  If *bn is NULL a new
+ * BIGNUM is allocated and stored in *bn; otherwise the existing *bn is
+ * overwritten.
+ */
+int BN_hex2bn(BIGNUM **bn, const char *a)
+{
+    BIGNUM *ret = NULL;
+    BN_ULONG l = 0;
+    int neg = 0, h, m, i, j, k, c;
+    int num;
+
+    if ((a == NULL) || (*a == '\0'))
+        return (0);
+
+    if (*a == '-') {
+        neg = 1;
+        a++;
+    }
+
+    /*
+     * Count the hex digits, but stop once the count is large enough that
+     * 'i * 4' (the bit count passed to bn_expand below) would overflow int.
+     */
+    for (i = 0; i <= (INT_MAX / 4) && isxdigit((unsigned char)a[i]); i++)
+        continue;
+
+    /* nothing has been allocated yet, so fail directly */
+    if (i > INT_MAX / 4)
+        return (0);
+
+    num = i + neg;
+    if (bn == NULL)
+        return (num);
+
+    /* a is the start of the hex digits, and it is 'i' long */
+    if (*bn == NULL) {
+        if ((ret = BN_new()) == NULL)
+            return (0);
+    } else {
+        ret = *bn;
+        BN_zero(ret);
+    }
+
+    /* i is the number of hex digits */
+    if (bn_expand(ret, i * 4) == NULL)
+        goto err;
+
+    j = i;                      /* least significant 'hex' */
+    m = 0;
+    h = 0;
+    /* consume the digits from the right, one BN_ULONG worth at a time */
+    while (j > 0) {
+        m = ((BN_BYTES * 2) <= j) ? (BN_BYTES * 2) : j;
+        l = 0;
+        for (;;) {
+            c = a[j - m];
+            if ((c >= '0') && (c <= '9'))
+                k = c - '0';
+            else if ((c >= 'a') && (c <= 'f'))
+                k = c - 'a' + 10;
+            else if ((c >= 'A') && (c <= 'F'))
+                k = c - 'A' + 10;
+            else
+                k = 0;          /* paranoia */
+            l = (l << 4) | k;
+
+            if (--m <= 0) {
+                ret->d[h++] = l;
+                break;
+            }
+        }
+        j -= (BN_BYTES * 2);
+    }
+    ret->top = h;
+    bn_correct_top(ret);
+    ret->neg = neg;
+
+    *bn = ret;
+    bn_check_top(ret);
+    return (num);
+ err:
+    /* only free ret if it was allocated here rather than supplied */
+    if (*bn == NULL)
+        BN_free(ret);
+    return (0);
+}
+
+/*
+ * Parse an optionally '-'-prefixed run of decimal digits from a.
+ *
+ * Returns the number of characters consumed (digits plus sign), or 0 on
+ * error.  If bn is NULL only that length is computed.  If *bn is NULL a new
+ * BIGNUM is allocated and stored in *bn; otherwise the existing *bn is
+ * overwritten.  Characters after the last digit are not part of the number.
+ */
+int BN_dec2bn(BIGNUM **bn, const char *a)
+{
+    BIGNUM *ret = NULL;
+    BN_ULONG l = 0;
+    int neg = 0, i, j, left;
+    int num;
+
+    if ((a == NULL) || (*a == '\0'))
+        return (0);
+    if (*a == '-') {
+        neg = 1;
+        a++;
+    }
+
+    /*
+     * Count the digits, but stop once the count is large enough that
+     * 'i * 4' (the bit count passed to bn_expand below) would overflow int.
+     */
+    for (i = 0; i <= (INT_MAX / 4) && isdigit((unsigned char)a[i]); i++)
+        continue;
+
+    /* nothing has been allocated yet, so fail directly */
+    if (i > INT_MAX / 4)
+        return (0);
+
+    num = i + neg;
+    if (bn == NULL)
+        return (num);
+
+    /*
+     * a is the start of the digits, and it is 'i' long. We chop it into
+     * BN_DEC_NUM digits at a time
+     */
+    if (*bn == NULL) {
+        if ((ret = BN_new()) == NULL)
+            return (0);
+    } else {
+        ret = *bn;
+        BN_zero(ret);
+    }
+
+    /* i is the number of digits, a bit of an over expand */
+    if (bn_expand(ret, i * 4) == NULL)
+        goto err;
+
+    /*
+     * Pre-load j so that the first (possibly short) group completes after
+     * exactly i % BN_DEC_NUM digits and every later group is full.
+     */
+    j = BN_DEC_NUM - (i % BN_DEC_NUM);
+    if (j == BN_DEC_NUM)
+        j = 0;
+    l = 0;
+    /* convert exactly the 'i' digits counted above, nothing beyond them */
+    for (left = i; left > 0; left--) {
+        l *= 10;
+        l += *a - '0';
+        a++;
+        if (++j == BN_DEC_NUM) {
+            if (!BN_mul_word(ret, BN_DEC_CONV)
+                || !BN_add_word(ret, l))
+                goto err;
+            l = 0;
+            j = 0;
+        }
+    }
+    ret->neg = neg;
+
+    bn_correct_top(ret);
+    *bn = ret;
+    bn_check_top(ret);
+    return (num);
+ err:
+    /* only free ret if it was allocated here rather than supplied */
+    if (*bn == NULL)
+        BN_free(ret);
+    return (0);
+}
+
+/*
+ * Parse a as a number: hexadecimal when the digits are prefixed with "0x"
+ * or "0X", decimal otherwise.  A leading '-' negates the result.
+ * Returns 1 on success and 0 if the underlying conversion fails.
+ */
+int BN_asc2bn(BIGNUM **bn, const char *a)
+{
+    const char *p = a;
+
+    /* the sign is applied here, so hand the helpers the unsigned digits */
+    if (*p == '-')
+        p++;
+
+    if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x')) {
+        if (!BN_hex2bn(bn, p + 2))
+            return 0;
+    } else {
+        if (!BN_dec2bn(bn, p))
+            return 0;
+    }
+    /*
+     * When bn is NULL the helpers only count digits and produce no BIGNUM,
+     * so there is nothing to negate.  Also never create a negative zero.
+     */
+    if (*a == '-' && bn != NULL && *bn != NULL && !BN_is_zero(*bn))
+        (*bn)->neg = 1;
+    return 1;
+}
+
+#ifndef OPENSSL_NO_BIO
+# ifndef OPENSSL_NO_FP_API
+/*
+ * Write a in hexadecimal to the stdio stream fp by wrapping it in a
+ * temporary file BIO (the stream is not closed).  Returns the result of
+ * BN_print(), or 0 if the BIO cannot be created.
+ */
+int BN_print_fp(FILE *fp, const BIGNUM *a)
+{
+    BIO *bio = BIO_new(BIO_s_file());
+    int rv;
+
+    if (bio == NULL)
+        return 0;
+
+    BIO_set_fp(bio, fp, BIO_NOCLOSE);
+    rv = BN_print(bio, a);
+    BIO_free(bio);
+    return rv;
+}
+# endif
+
+/*
+ * Write a to bp as upper-case hexadecimal, one digit per nibble, without
+ * leading zeros; a '-' is written first when a->neg is set and "0" is
+ * written for a zero value.  Returns 1 on success, 0 as soon as any
+ * BIO_write() fails.
+ */
+int BN_print(BIO *bp, const BIGNUM *a)
+{
+    int word, shift, nibble;
+    int started = 0;
+
+    if (a->neg && BIO_write(bp, "-", 1) != 1)
+        return 0;
+    if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1)
+        return 0;
+
+    /* Walk the words from most to least significant, 4 bits at a time. */
+    for (word = a->top - 1; word >= 0; word--) {
+        for (shift = BN_BITS2 - 4; shift >= 0; shift -= 4) {
+            nibble = ((int)(a->d[word] >> (long)shift)) & 0x0f;
+            /* skip zero nibbles until the first non-zero one is seen */
+            if (!started && nibble == 0)
+                continue;
+            if (BIO_write(bp, &Hex[nibble], 1) != 1)
+                return 0;
+            started = 1;
+        }
+    }
+    return 1;
+}
+#endif
+
+/*
+ * Return a static string of the form "bn(X,Y)": X is the bit width of
+ * BN_ULLONG when BN_LLONG is defined (otherwise of BN_ULONG) and Y is the
+ * bit width of BN_ULONG.  The string is formatted on the first call only.
+ */
+char *BN_options(void)
+{
+    static int init = 0;
+    static char data[16];
+
+    if (init)
+        return data;
+
+    init++;
+#ifdef BN_LLONG
+    BIO_snprintf(data, sizeof data, "bn(%d,%d)",
+                 (int)sizeof(BN_ULLONG) * 8, (int)sizeof(BN_ULONG) * 8);
+#else
+    BIO_snprintf(data, sizeof data, "bn(%d,%d)",
+                 (int)sizeof(BN_ULONG) * 8, (int)sizeof(BN_ULONG) * 8);
+#endif
+    return data;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_print.o
+++ b/openssl-1.0.2f/crypto/bn/bn_print.o
--- a/openssl-1.0.2f/crypto/bn/bn_rand.c
+++ b/openssl-1.0.2f/crypto/bn/bn_rand.c
@@ -0,0 +1,295 @@
+/* crypto/bn/bn_rand.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+#include <openssl/rand.h>
+
+/*
+ * Fill rnd with a random value of at most 'bits' bits.
+ *
+ * pseudorand: 0 draws the bytes with RAND_bytes(); any non-zero value uses
+ *             RAND_pseudo_bytes(); the value 2 additionally rewrites the
+ *             buffer with repeated / all-zero / all-one bytes.
+ * top:        negative leaves the high bits as drawn, 0 forces the most
+ *             significant requested bit to 1, positive forces the two most
+ *             significant requested bits to 1.
+ * bottom:     non-zero forces the lowest bit of the last byte to 1.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
+{
+    unsigned char *buf = NULL;
+    int ret = 0, bit, bytes, mask;
+    time_t tim;
+
+    /* top > 0 asks for two forced bits, which a 1-bit number cannot hold */
+    if (bits < 0 || (bits == 1 && top > 0)) {
+        BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);
+        return 0;
+    }
+
+    if (bits == 0) {
+        BN_zero(rnd);
+        return 1;
+    }
+
+    /*
+     * bytes: buffer size rounded up to whole bytes.
+     * bit:   position of the highest requested bit inside buf[0].
+     * mask:  every bit of buf[0] above that position (cleared further down).
+     */
+    bytes = (bits + 7) / 8;
+    bit = (bits - 1) % 8;
+    mask = 0xff << (bit + 1);
+
+    buf = (unsigned char *)OPENSSL_malloc(bytes);
+    if (buf == NULL) {
+        BNerr(BN_F_BNRAND, ERR_R_MALLOC_FAILURE);
+        goto err;
+    }
+
+    /* make a random number and set the top and bottom bits */
+    /* the current time is added to the RAND pool with a 0.0 estimate */
+    time(&tim);
+    RAND_add(&tim, sizeof(tim), 0.0);
+
+    if (pseudorand) {
+        if (RAND_pseudo_bytes(buf, bytes) == -1)
+            goto err;
+    } else {
+        if (RAND_bytes(buf, bytes) <= 0)
+            goto err;
+    }
+
+#if 1
+    if (pseudorand == 2) {
+        /*
+         * generate patterns that are more likely to trigger BN library bugs
+         */
+        int i;
+        unsigned char c;
+
+        /*
+         * Per byte, a fresh pseudo-random c selects: copy the previous
+         * byte (c >= 128, not for the first byte), 0x00 (c < 42),
+         * 0xff (c < 84), or leave the byte as drawn.
+         */
+        for (i = 0; i < bytes; i++) {
+            if (RAND_pseudo_bytes(&c, 1) < 0)
+                goto err;
+            if (c >= 128 && i > 0)
+                buf[i] = buf[i - 1];
+            else if (c < 42)
+                buf[i] = 0;
+            else if (c < 84)
+                buf[i] = 255;
+        }
+    }
+#endif
+
+    if (top >= 0) {
+        if (top) {
+            /* force the two highest requested bits */
+            if (bit == 0) {
+                /*
+                 * the highest bit is bit 0 of buf[0], so the second one is
+                 * bit 7 of buf[1]
+                 */
+                buf[0] = 1;
+                buf[1] |= 0x80;
+            } else {
+                buf[0] |= (3 << (bit - 1));
+            }
+        } else {
+            /* force only the highest requested bit */
+            buf[0] |= (1 << bit);
+        }
+    }
+    /* drop any bits of the first byte above the requested width */
+    buf[0] &= ~mask;
+    if (bottom)                 /* set bottom bit if requested */
+        buf[bytes - 1] |= 1;
+    if (!BN_bin2bn(buf, bytes, rnd))
+        goto err;
+    ret = 1;
+ err:
+    /* wipe the random bytes before releasing the buffer */
+    if (buf != NULL) {
+        OPENSSL_cleanse(buf, bytes);
+        OPENSSL_free(buf);
+    }
+    bn_check_top(rnd);
+    return (ret);
+}
+
+/*
+ * Public entry point: random number of 'bits' bits, produced by bnrand()
+ * in mode 0.  Returns 1 on success, 0 on failure.
+ */
+int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
+{
+    const int mode = 0;
+
+    return bnrand(mode, rnd, bits, top, bottom);
+}
+
+/*
+ * Public entry point: random number of 'bits' bits, produced by bnrand()
+ * in mode 1.  Returns 1 on success, 0 on failure.
+ */
+int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom)
+{
+    const int mode = 1;
+
+    return bnrand(mode, rnd, bits, top, bottom);
+}
+
+#if 1
+/*
+ * Test helper: random number of 'bits' bits, produced by bnrand() in
+ * mode 2.  Returns 1 on success, 0 on failure.
+ */
+int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
+{
+    const int mode = 2;
+
+    return bnrand(mode, rnd, bits, top, bottom);
+}
+#endif
+
+/*
+ * random number r:  0 <= r < range
+ *
+ * pseudo selects the generator: non-zero uses BN_pseudo_rand, zero uses
+ * BN_rand.  range must be strictly positive.  Returns 1 on success and 0 on
+ * failure (invalid range, generator or subtraction failure, or the
+ * iteration counter running out).
+ */
+static int bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
+{
+    int (*bn_rand) (BIGNUM *, int, int, int) =
+        pseudo ? BN_pseudo_rand : BN_rand;
+    int n;
+    /* iteration budget for the rejection loops below */
+    int count = 100;
+
+    if (range->neg || BN_is_zero(range)) {
+        BNerr(BN_F_BN_RAND_RANGE, BN_R_INVALID_RANGE);
+        return 0;
+    }
+
+    n = BN_num_bits(range);     /* n > 0 */
+
+    /* BN_is_bit_set(range, n - 1) always holds */
+
+    if (n == 1)
+        /* a 1-bit range is 1, so 0 is the only value below it */
+        BN_zero(r);
+    else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
+        /*
+         * range = 100..._2, so 3*range (= 11..._2) is exactly one bit longer
+         * than range
+         */
+        do {
+            /* draw n + 1 bits without forcing the top or bottom bit */
+            if (!bn_rand(r, n + 1, -1, 0))
+                return 0;
+            /*
+             * If r < 3*range, use r := r MOD range (which is either r, r -
+             * range, or r - 2*range). Otherwise, iterate once more. Since
+             * 3*range = 11..._2, each iteration succeeds with probability >=
+             * .75.
+             */
+            if (BN_cmp(r, range) >= 0) {
+                if (!BN_sub(r, r, range))
+                    return 0;
+                if (BN_cmp(r, range) >= 0)
+                    if (!BN_sub(r, r, range))
+                        return 0;
+            }
+
+            /* the budget is charged on every pass, successful or not */
+            if (!--count) {
+                BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
+                return 0;
+            }
+
+        }
+        while (BN_cmp(r, range) >= 0);
+    } else {
+        do {
+            /* range = 11..._2  or  range = 101..._2 */
+            /* draw n bits and retry until the value falls below range */
+            if (!bn_rand(r, n, -1, 0))
+                return 0;
+
+            if (!--count) {
+                BNerr(BN_F_BN_RAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
+                return 0;
+            }
+        }
+        while (BN_cmp(r, range) >= 0);
+    }
+
+    bn_check_top(r);
+    return 1;
+}
+
+/*
+ * Public entry point: r with 0 <= r < range, via bn_rand_range() with the
+ * pseudo flag cleared.  Returns 1 on success, 0 on failure.
+ */
+int BN_rand_range(BIGNUM *r, const BIGNUM *range)
+{
+    const int pseudo = 0;
+
+    return bn_rand_range(pseudo, r, range);
+}
+
+/*
+ * Public entry point: r with 0 <= r < range, via bn_rand_range() with the
+ * pseudo flag set.  Returns 1 on success, 0 on failure.
+ */
+int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
+{
+    const int pseudo = 1;
+
+    return bn_rand_range(pseudo, r, range);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_rand.o
+++ b/openssl-1.0.2f/crypto/bn/bn_rand.o
--- a/openssl-1.0.2f/crypto/bn/bn_recp.c
+++ b/openssl-1.0.2f/crypto/bn/bn_recp.c
@@ -0,0 +1,251 @@
+/* crypto/bn/bn_recp.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * Bring a caller-owned reciprocal context into its empty state: the flag
+ * word and the cached bit count are cleared and both embedded BIGNUMs
+ * (the modulus N and its reciprocal Nr) are initialised.  The 'shift'
+ * member is not touched here; BN_RECP_CTX_set() assigns it.
+ */
+void BN_RECP_CTX_init(BN_RECP_CTX *recp)
+{
+    recp->flags = 0;
+    recp->num_bits = 0;
+    BN_init(&recp->Nr);
+    BN_init(&recp->N);
+}
+
+/*
+ * Allocate a reciprocal context on the heap and initialise it.  The
+ * BN_FLG_MALLOCED flag is recorded so that BN_RECP_CTX_free() knows it must
+ * release the structure itself.  Returns NULL if the allocation fails.
+ */
+BN_RECP_CTX *BN_RECP_CTX_new(void)
+{
+    BN_RECP_CTX *fresh;
+
+    fresh = (BN_RECP_CTX *)OPENSSL_malloc(sizeof(BN_RECP_CTX));
+    if (fresh != NULL) {
+        BN_RECP_CTX_init(fresh);
+        fresh->flags = BN_FLG_MALLOCED;
+    }
+    return fresh;
+}
+
+/*
+ * Release the two BIGNUMs held by a reciprocal context and, when the
+ * context was marked BN_FLG_MALLOCED, the structure itself.  Passing NULL
+ * is a no-op.
+ */
+void BN_RECP_CTX_free(BN_RECP_CTX *recp)
+{
+    if (recp != NULL) {
+        BN_free(&recp->N);
+        BN_free(&recp->Nr);
+        if ((recp->flags & BN_FLG_MALLOCED) != 0)
+            OPENSSL_free(recp);
+    }
+}
+
+/*
+ * Load the modulus d into a reciprocal context.  The modulus is copied
+ * into recp->N and its bit length is cached; the reciprocal recp->Nr is
+ * zeroed and recp->shift reset to 0.  The ctx argument is not used by this
+ * function.
+ *
+ * Returns 1 on success, 0 if copying the modulus fails.
+ */
+int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx)
+{
+    if (!BN_copy(&recp->N, d))
+        return 0;
+
+    BN_zero(&recp->Nr);
+    recp->shift = 0;
+    recp->num_bits = BN_num_bits(d);
+    return 1;
+}
+
+/*
+ * Compute r := (x * y) mod N, where N is the modulus stored in recp.
+ *
+ * When y is NULL no multiplication is done and x itself is reduced.  When
+ * x and y are the same object the product is formed with BN_sqr() instead
+ * of BN_mul().  The reduction is delegated to BN_div_recp(), whose return
+ * value is passed back; 0 is returned if a temporary cannot be obtained or
+ * the multiplication fails.
+ */
+int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+                          BN_RECP_CTX *recp, BN_CTX *ctx)
+{
+    int ok = 0;
+    BIGNUM *prod;
+    const BIGNUM *operand = x;  /* default: just do the mod on x */
+
+    BN_CTX_start(ctx);
+    prod = BN_CTX_get(ctx);
+    if (prod == NULL)
+        goto err;
+
+    if (y != NULL) {
+        int mul_ok;
+
+        if (x == y)
+            mul_ok = BN_sqr(prod, x, ctx);
+        else
+            mul_ok = BN_mul(prod, x, y, ctx);
+        if (!mul_ok)
+            goto err;
+        operand = prod;
+    }
+
+    ok = BN_div_recp(NULL, r, operand, recp, ctx);
+ err:
+    BN_CTX_end(ctx);
+    bn_check_top(r);
+    return ok;
+}
+
+/*
+ * Divide m by the modulus N held in recp using a cached reciprocal.
+ *
+ * dv   - receives the quotient; may be NULL, in which case a temporary
+ *        from ctx is used and the quotient is discarded.
+ * rem  - receives the remainder; may be NULL in the same way.
+ * m    - the dividend.
+ * recp - holds N, its bit length, and the cached reciprocal Nr together
+ *        with the shift it was computed for; Nr/shift are refreshed here
+ *        whenever the required shift changes.
+ * ctx  - scratch BIGNUM pool.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+                BN_RECP_CTX *recp, BN_CTX *ctx)
+{
+    int i, j, ret = 0;
+    BIGNUM *a, *b, *d, *r;
+
+    BN_CTX_start(ctx);
+    a = BN_CTX_get(ctx);
+    b = BN_CTX_get(ctx);
+    /* Use the caller's outputs where given, otherwise scratch values. */
+    if (dv != NULL)
+        d = dv;
+    else
+        d = BN_CTX_get(ctx);
+    if (rem != NULL)
+        r = rem;
+    else
+        r = BN_CTX_get(ctx);
+    /* A single check covers every BN_CTX_get() call above. */
+    if (a == NULL || b == NULL || d == NULL || r == NULL)
+        goto err;
+
+    /*
+     * Shortcut when |m| < |N| (unsigned compare): the quotient is 0 and the
+     * remainder is m itself.  This path releases the ctx frame on its own
+     * rather than going through the err label.
+     */
+    if (BN_ucmp(m, &(recp->N)) < 0) {
+        BN_zero(d);
+        if (!BN_copy(r, m)) {
+            BN_CTX_end(ctx);
+            return 0;
+        }
+        BN_CTX_end(ctx);
+        return (1);
+    }
+
+    /*
+     * We want the remainder. Given input of ABCDEF / ab we need to multiply
+     * ABCDEF by 3 digits of the reciprocal of ab
+     */
+
+    /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
+    i = BN_num_bits(m);
+    j = recp->num_bits << 1;
+    if (j > i)
+        i = j;
+
+    /* Nr := round(2^i / N) */
+    /* Only recomputed when the cached reciprocal was built for another i. */
+    if (i != recp->shift)
+        recp->shift = BN_reciprocal(&(recp->Nr), &(recp->N), i, ctx);
+    /* BN_reciprocal could have returned -1 for an error */
+    if (recp->shift == -1)
+        goto err;
+
+    /*-
+     * d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
+     *    = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
+     *   <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
+     *    = |m/N|
+     */
+    if (!BN_rshift(a, m, recp->num_bits))
+        goto err;
+    if (!BN_mul(b, a, &(recp->Nr), ctx))
+        goto err;
+    if (!BN_rshift(d, b, i - recp->num_bits))
+        goto err;
+    /* Work on magnitudes from here on; signs are restored at the end. */
+    d->neg = 0;
+
+    /* r := |m| - |N| * d, the remainder implied by the estimated quotient */
+    if (!BN_mul(b, &(recp->N), d, ctx))
+        goto err;
+    if (!BN_usub(r, m, b))
+        goto err;
+    r->neg = 0;
+
+#if 1
+    /*
+     * The estimate d may be slightly too small.  Correct it by subtracting
+     * N from r (and bumping d) until r < N.  At most three corrections are
+     * allowed; needing a fourth is reported as BN_R_BAD_RECIPROCAL.
+     */
+    j = 0;
+    while (BN_ucmp(r, &(recp->N)) >= 0) {
+        if (j++ > 2) {
+            BNerr(BN_F_BN_DIV_RECP, BN_R_BAD_RECIPROCAL);
+            goto err;
+        }
+        if (!BN_usub(r, r, &(recp->N)))
+            goto err;
+        if (!BN_add_word(d, 1))
+            goto err;
+    }
+#endif
+
+    /*
+     * Restore signs: a non-zero remainder takes the sign of m, and the
+     * quotient is negative exactly when m and N differ in sign.
+     */
+    r->neg = BN_is_zero(r) ? 0 : m->neg;
+    d->neg = m->neg ^ recp->N.neg;
+    ret = 1;
+ err:
+    BN_CTX_end(ctx);
+    bn_check_top(dv);
+    bn_check_top(rem);
+    return (ret);
+}
+
+/*
+ * r := 2^len / m
+ *
+ * A scratch value is set to 2^len (by setting bit number len) and divided
+ * by m with BN_div(); only the quotient is kept and stored in r.
+ *
+ * Returns len on success and -1 on any failure, so the caller can store
+ * the return value as the shift the reciprocal was computed for.
+ */
+int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
+{
+    int result = -1;
+    BIGNUM *pow2;
+
+    BN_CTX_start(ctx);
+    pow2 = BN_CTX_get(ctx);
+    if (pow2 != NULL
+        && BN_set_bit(pow2, len)
+        && BN_div(r, NULL, pow2, m, ctx))
+        result = len;
+
+    bn_check_top(r);
+    BN_CTX_end(ctx);
+    return result;
+}
--- a/openssl-1.0.2f/crypto/bn/bn_recp.o
+++ b/openssl-1.0.2f/crypto/bn/bn_recp.o
--- a/openssl-1.0.2f/crypto/bn/bn_shift.c
+++ b/openssl-1.0.2f/crypto/bn/bn_shift.c
@@ -0,0 +1,224 @@
+/* crypto/bn/bn_shift.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include "bn_lcl.h"
+
+/*
+ * r := a << 1 (multiply by two).  r and a may be the same object.
+ *
+ * r is grown to hold one word more than a, each word of a is shifted left
+ * by one bit with the bit shifted out carried into the next word, and a
+ * final carry becomes a new top word equal to 1.  The sign of a is copied
+ * to r.
+ *
+ * Returns 1 on success, 0 if r cannot be expanded.
+ */
+int BN_lshift1(BIGNUM *r, const BIGNUM *a)
+{
+    BN_ULONG *src, *dst, word, carry = 0;
+    int k;
+
+    bn_check_top(r);
+    bn_check_top(a);
+
+    /* When r aliases a these two copies are self-assignments. */
+    r->neg = a->neg;
+    if (bn_wexpand(r, a->top + 1) == NULL)
+        return 0;
+    r->top = a->top;
+
+    /* Fetch the data pointers only after the expansion above. */
+    src = a->d;
+    dst = r->d;
+    for (k = 0; k < a->top; k++) {
+        word = src[k];
+        dst[k] = ((word << 1) | carry) & BN_MASK2;
+        carry = (word & BN_TBIT) ? 1 : 0;
+    }
+    if (carry) {
+        /* k == a->top here: the carry spills into a new top word */
+        dst[k] = 1;
+        r->top++;
+    }
+    bn_check_top(r);
+    return 1;
+}
+
+/*
+ * r := a >> 1 (the magnitude of a is halved, the low bit is discarded).
+ * r and a may be the same object.
+ *
+ * The sign of a is carried over to r, except that a result of zero is
+ * always stored with neg == 0: shifting a value of magnitude 1 leaves no
+ * words at all, and without that reset r would become a "negative zero"
+ * (top == 0 with neg == 1) when a is -1.
+ *
+ * Returns 1 on success, 0 if r cannot be expanded.
+ */
+int BN_rshift1(BIGNUM *r, const BIGNUM *a)
+{
+    BN_ULONG *ap, *rp, t, c;
+    int i, j;
+
+    bn_check_top(r);
+    bn_check_top(a);
+
+    if (BN_is_zero(a)) {
+        BN_zero(r);
+        return (1);
+    }
+    i = a->top;
+    ap = a->d;
+    /* A top word equal to 1 shifts to 0, so the result is one word shorter */
+    j = i - (ap[i - 1] == 1);
+    if (a != r) {
+        if (bn_wexpand(r, j) == NULL)
+            return (0);
+        r->neg = a->neg;
+    }
+    rp = r->d;
+    /* Top word first: store it only if something is left after the shift */
+    t = ap[--i];
+    c = (t & 1) ? BN_TBIT : 0;
+    if (t >>= 1)
+        rp[i] = t;
+    /* Remaining words, high to low; c carries the bit shifted out above */
+    while (i > 0) {
+        t = ap[--i];
+        rp[i] = ((t >> 1) & BN_MASK2) | c;
+        c = (t & 1) ? BN_TBIT : 0;
+    }
+    r->top = j;
+    if (r->top == 0)
+        r->neg = 0;             /* don't allow negative zero */
+    bn_check_top(r);
+    return (1);
+}
+
+/*
+ * r := a << n.  r and a may be the same object.
+ *
+ * The shift is split into whole words (n / BN_BITS2) and a residual bit
+ * count (n % BN_BITS2).  Words are moved from the top down so that an
+ * in-place shift never overwrites input it still has to read; the vacated
+ * low words are cleared afterwards.  The sign of a is copied to r.
+ *
+ * Returns 1 on success; 0 if n is negative (BN_R_INVALID_SHIFT is raised)
+ * or r cannot be expanded.
+ */
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
+{
+    int k, word_shift, bit_shift, back_shift;
+    BN_ULONG *dst, *src;
+    BN_ULONG w;
+
+    bn_check_top(r);
+    bn_check_top(a);
+
+    if (n < 0) {
+        BNerr(BN_F_BN_LSHIFT, BN_R_INVALID_SHIFT);
+        return 0;
+    }
+
+    word_shift = n / BN_BITS2;
+    bit_shift = n % BN_BITS2;
+    back_shift = BN_BITS2 - bit_shift;
+
+    r->neg = a->neg;
+    if (bn_wexpand(r, a->top + word_shift + 1) == NULL)
+        return 0;
+
+    src = a->d;
+    dst = r->d;
+    /* The extra top word receives the bits spilled from a's top word. */
+    dst[a->top + word_shift] = 0;
+    if (bit_shift != 0) {
+        for (k = a->top - 1; k >= 0; k--) {
+            w = src[k];
+            dst[word_shift + k + 1] |= (w >> back_shift) & BN_MASK2;
+            dst[word_shift + k] = (w << bit_shift) & BN_MASK2;
+        }
+    } else {
+        /* Whole-word shift: a plain copy, highest word first. */
+        for (k = a->top - 1; k >= 0; k--)
+            dst[word_shift + k] = src[k];
+    }
+    /* Zero the low words vacated by the whole-word part of the shift. */
+    memset(dst, 0, word_shift * sizeof(dst[0]));
+
+    r->top = a->top + word_shift + 1;
+    bn_correct_top(r);
+    bn_check_top(r);
+    return 1;
+}
+
+/*
+ * r := a >> n (the magnitude of a is shifted right by n bits).  r and a
+ * may be the same object.
+ *
+ * The shift is split into whole words (nw) and residual bits (rb).  The
+ * sign of a is carried over to r, except that a result of zero is always
+ * stored with neg == 0: when every significant bit is shifted out but
+ * fewer than a->top whole words are skipped (for example -1 >> 1), the
+ * word count of the result is 0 and without that reset r would become a
+ * "negative zero".
+ *
+ * Returns 1 on success; 0 if n is negative (BN_R_INVALID_SHIFT is raised)
+ * or r cannot be expanded.
+ */
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
+{
+    int i, j, nw, lb, rb;
+    BN_ULONG *t, *f;
+    BN_ULONG l, tmp;
+
+    bn_check_top(r);
+    bn_check_top(a);
+
+    if (n < 0) {
+        BNerr(BN_F_BN_RSHIFT, BN_R_INVALID_SHIFT);
+        return 0;
+    }
+
+    nw = n / BN_BITS2;
+    rb = n % BN_BITS2;
+    lb = BN_BITS2 - rb;
+    /* Every word is shifted out (or a is already zero): the result is 0 */
+    if (nw >= a->top || a->top == 0) {
+        BN_zero(r);
+        return (1);
+    }
+    /* Number of words needed for the BN_num_bits(a) - n remaining bits */
+    i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
+    if (r != a) {
+        r->neg = a->neg;
+        if (bn_wexpand(r, i) == NULL)
+            return (0);
+    } else {
+        if (n == 0)
+            return 1;           /* or the copying loop will go berserk */
+    }
+
+    f = &(a->d[nw]);
+    t = r->d;
+    j = a->top - nw;
+    r->top = i;
+
+    if (rb == 0) {
+        /* Whole-word shift: straight copy of the surviving words */
+        for (i = j; i != 0; i--)
+            *(t++) = *(f++);
+    } else {
+        /* Each output word joins the high part of one input word with the
+         * low part of the next */
+        l = *(f++);
+        for (i = j - 1; i != 0; i--) {
+            tmp = (l >> rb) & BN_MASK2;
+            l = *(f++);
+            *(t++) = (tmp | (l << lb)) & BN_MASK2;
+        }
+        /* The last word is stored only if any bits remain in it */
+        if ((l = (l >> rb) & BN_MASK2))
+            *(t) = l;
+    }
+    if (r->top == 0)
+        r->neg = 0;             /* don't allow negative zero */
+    bn_check_top(r);
+    return (1);
+}
--- a/openssl-1.0.2f/crypto/bn/bn_shift.o
+++ b/openssl-1.0.2f/crypto/bn/bn_shift.o
--- a/Show More
+++ b/Show More