From 8d1e735041ed8f5b5d83eead59786e96521ef79a Mon Sep 17 00:00:00 2001
From: Linus Yang
Date: Fri, 18 Aug 2017 21:16:49 +0800
Subject: [PATCH 1/2] Add mips big-endian asm

---
 lib/aes_acc/asm/mips.S    |   12 +-
 lib/aes_acc/asm/mips_be.S | 1831 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1835 insertions(+), 8 deletions(-)
 create mode 100644 lib/aes_acc/asm/mips_be.S

diff --git a/lib/aes_acc/asm/mips.S b/lib/aes_acc/asm/mips.S
index 4aa0e4c..97a12b9 100644
--- a/lib/aes_acc/asm/mips.S
+++ b/lib/aes_acc/asm/mips.S
@@ -507,6 +507,7 @@ AES_encrypt:
 	.frame	$29,64,$31
 	.mask	0xc0ff0000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,64
 	sw	$31,64-1*4($29)
 	sw	$30,64-2*4($29)
@@ -518,8 +519,6 @@ AES_encrypt:
 	sw	$18,64-8*4($29)
 	sw	$17,64-9*4($29)
 	sw	$16,64-10*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_encrypt
 	.set	reorder
 	la	$7,AES_Te		# PIC-ified 'load address'
 
@@ -1050,6 +1049,7 @@ AES_decrypt:
 	.frame	$29,64,$31
 	.mask	0xc0ff0000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,64
 	sw	$31,64-1*4($29)
 	sw	$30,64-2*4($29)
@@ -1061,8 +1061,6 @@ AES_decrypt:
 	sw	$18,64-8*4($29)
 	sw	$17,64-9*4($29)
 	sw	$16,64-10*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_decrypt
 	.set	reorder
 	la	$7,AES_Td		# PIC-ified 'load address'
 
@@ -1359,11 +1357,10 @@ AES_set_encrypt_key:
 	.frame	$29,32,$31
 	.mask	0xc0000000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,32
 	sw	$31,32-1*4($29)
 	sw	$30,32-2*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_set_encrypt_key
 	.set	reorder
 	la	$7,AES_Te4		# PIC-ified 'load address'
 
@@ -1383,11 +1380,10 @@ AES_set_decrypt_key:
 	.frame	$29,32,$31
 	.mask	0xc0000000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,32
 	sw	$31,32-1*4($29)
 	sw	$30,32-2*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_set_decrypt_key
 	.set	reorder
 	la	$7,AES_Te4		# PIC-ified 'load address'
 
diff --git a/lib/aes_acc/asm/mips_be.S b/lib/aes_acc/asm/mips_be.S
new file mode 100644
index 0000000..6fcab5a
--- /dev/null
+++ b/lib/aes_acc/asm/mips_be.S
@@ -0,0 +1,1831 @@
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
+.option	pic2
+#endif
+.set	noat
+.align	5
+.ent	_mips_AES_encrypt
+_mips_AES_encrypt:
+	.frame	$29,0,$31
+	.set	reorder
+	lw	$12,0($6)
+	lw	$13,4($6)
+	lw	$14,8($6)
+	lw	$15,12($6)
+	lw	$30,240($6)
+	add	$3,$6,16
+
+	xor	$8,$12
+	xor	$9,$13
+	xor	$10,$14
+	xor	$11,$15
+
+	sub	$30,1
+#if defined(__mips_smartmips)
+	ext	$1,$9,16,8
+.Loop_enc:
+	ext	$2,$10,16,8
+	ext	$24,$11,16,8
+	ext	$25,$8,16,8
+	lwxs	$12,$1($7)		# Te1[s1>>16]
+	ext	$1,$10,8,8
+	lwxs	$13,$2($7)		# Te1[s2>>16]
+	ext	$2,$11,8,8
+	lwxs	$14,$24($7)		# Te1[s3>>16]
+	ext	$24,$8,8,8
+	lwxs	$15,$25($7)		# Te1[s0>>16]
+	ext	$25,$9,8,8
+
+	lwxs	$16,$1($7)		# Te2[s2>>8]
+	ext	$1,$11,0,8
+	lwxs	$17,$2($7)		# Te2[s3>>8]
+	ext	$2,$8,0,8
+	lwxs	$18,$24($7)		# Te2[s0>>8]
+	ext	$24,$9,0,8
+	lwxs	$19,$25($7)		# Te2[s1>>8]
+	ext	$25,$10,0,8
+
+	lwxs	$20,$1($7)		# Te3[s3]
+	ext	$1,$8,24,8
+	lwxs	$21,$2($7)		# Te3[s0]
+	ext	$2,$9,24,8
+	lwxs	$22,$24($7)		# Te3[s1]
+	ext	$24,$10,24,8
+	lwxs	$23,$25($7)		# Te3[s2]
+	ext	$25,$11,24,8
+
+	rotr	$12,$12,8
+	rotr	$13,$13,8
+	rotr	$14,$14,8
+	rotr	$15,$15,8
+
+	rotr	$16,$16,16
+	rotr	$17,$17,16
+	rotr	$18,$18,16
+	rotr	$19,$19,16
+
+	xor	$12,$16
+	lwxs	$16,$1($7)		# Te0[s0>>24]
+	xor	$13,$17
+	lwxs	$17,$2($7)		# Te0[s1>>24]
+	xor	$14,$18
+	lwxs	$18,$24($7)		# Te0[s2>>24]
+	xor	$15,$19
+	lwxs	$19,$25($7)		# Te0[s3>>24]
+
+	rotr	$20,$20,24
+	lw	$8,0($3)
+	rotr	$21,$21,24
+	lw	$9,4($3)
+	rotr	$22,$22,24
+	lw	$10,8($3)
+ rotr $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_enc + ext $1,$9,16,8 + + srl $1,$9,14 +#else + srl $1,$9,14 +.Loop_enc: + srl $2,$10,14 + srl $24,$11,14 + srl $25,$8,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $12,0($1) # Te1[s1>>16] + srl $1,$10,6 + lw $13,0($2) # Te1[s2>>16] + srl $2,$11,6 + lw $14,0($24) # Te1[s3>>16] + srl $24,$8,6 + lw $15,0($25) # Te1[s0>>16] + srl $25,$9,6 +#else + lwl $12,3($1) # Te1[s1>>16] + lwl $13,3($2) # Te1[s2>>16] + lwl $14,3($24) # Te1[s3>>16] + lwl $15,3($25) # Te1[s0>>16] + lwr $12,2($1) # Te1[s1>>16] + srl $1,$10,6 + lwr $13,2($2) # Te1[s2>>16] + srl $2,$11,6 + lwr $14,2($24) # Te1[s3>>16] + srl $24,$8,6 + lwr $15,2($25) # Te1[s0>>16] + srl $25,$9,6 +#endif + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 +# if defined(_MIPSEL) + lw $16,0($1) # Te2[s2>>8] + sll $1,$11,2 + lw $17,0($2) # Te2[s3>>8] + sll $2,$8,2 + lw $18,0($24) # Te2[s0>>8] + sll $24,$9,2 + lw $19,0($25) # Te2[s1>>8] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lw $20,0($1) # Te3[s3] + ins $1,$8,2,8 + lw $21,0($2) # Te3[s0] + ins $2,$9,2,8 + lw $22,0($24) # Te3[s1] + ins $24,$10,2,8 + lw $23,0($25) # Te3[s2] + ins $25,$11,2,8 +# else + lw $16,0($1) # Te2[s2>>8] + ins $1,$11,2,8 + lw $17,0($2) # Te2[s3>>8] + ins $2,$8,2,8 + lw $18,0($24) # Te2[s0>>8] + ins $24,$9,2,8 + lw $19,0($25) # Te2[s1>>8] + ins $25,$10,2,8 + + lw $20,0($1) # Te3[s3] + srl $1,$8,22 + lw $21,0($2) # Te3[s0] + srl $2,$9,22 + lw $22,0($24) # Te3[s1] + srl $24,$10,22 + lw $23,0($25) # Te3[s2] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# endif + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + rotr $20,$20,24 + rotr $21,$21,24 + rotr $22,$22,24 + rotr $23,$23,24 +#else + lwl $16,2($1) # Te2[s2>>8] + lwl $17,2($2) # Te2[s3>>8] + lwl $18,2($24) # Te2[s0>>8] + lwl $19,2($25) # Te2[s1>>8] + lwr $16,1($1) # Te2[s2>>8] + sll $1,$11,2 + lwr $17,1($2) # Te2[s3>>8] + sll $2,$8,2 + lwr $18,1($24) # Te2[s0>>8] + sll $24,$9,2 + lwr $19,1($25) # Te2[s1>>8] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,1($1) # Te3[s3] + lwl $21,1($2) # Te3[s0] + lwl $22,1($24) # Te3[s1] + lwl $23,1($25) # Te3[s2] + lwr $20,0($1) # Te3[s3] + srl $1,$8,22 + lwr $21,0($2) # Te3[s0] + srl $2,$9,22 + lwr $22,0($24) # Te3[s1] + srl $24,$10,22 + lwr $23,0($25) # Te3[s2] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + xor $12,$16 + lw $16,0($1) # Te0[s0>>24] + xor $13,$17 + lw $17,0($2) # Te0[s1>>24] + xor $14,$18 + lw $18,0($24) # Te0[s2>>24] + xor $15,$19 + lw $19,0($25) # Te0[s3>>24] + + xor $12,$20 + lw $8,0($3) + xor $13,$21 + lw $9,4($3) + xor $14,$22 + lw $10,8($3) + xor $15,$23 + lw $11,12($3) + + xor $12,$16 + xor $13,$17 + 
xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_enc + srl $1,$9,14 +#endif + + .set reorder + srl $2,$10,14 + srl $24,$11,14 + srl $25,$8,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,2($1) # Te4[s1>>16] + srl $1,$10,6 + lbu $13,2($2) # Te4[s2>>16] + srl $2,$11,6 + lbu $14,2($24) # Te4[s3>>16] + srl $24,$8,6 + lbu $15,2($25) # Te4[s0>>16] + srl $25,$9,6 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) + lbu $16,2($1) # Te4[s2>>8] + ins $1,$8,2,8 + lbu $17,2($2) # Te4[s3>>8] + ins $2,$9,2,8 + lbu $18,2($24) # Te4[s0>>8] + ins $24,$10,2,8 + lbu $19,2($25) # Te4[s1>>8] + ins $25,$11,2,8 + + lbu $20,2($1) # Te4[s0>>24] + sll $1,$11,2 + lbu $21,2($2) # Te4[s1>>24] + sll $2,$8,2 + lbu $22,2($24) # Te4[s2>>24] + sll $24,$9,2 + lbu $23,2($25) # Te4[s3>>24] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# else + lbu $16,2($1) # Te4[s2>>8] + srl $1,$8,22 + lbu $17,2($2) # Te4[s3>>8] + srl $2,$9,22 + lbu $18,2($24) # Te4[s0>>8] + srl $24,$10,22 + lbu $19,2($25) # Te4[s1>>8] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,2($1) # Te4[s0>>24] + ins $1,$11,2,8 + lbu $21,2($2) # Te4[s1>>24] + ins $2,$8,2,8 + lbu $22,2($24) # Te4[s2>>24] + ins $24,$9,2,8 + lbu $23,2($25) # Te4[s3>>24] + ins $25,$10,2,8 +# endif + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + ins $12,$16,8,8 + lbu $16,2($1) # Te4[s3] + ins $13,$17,8,8 + lbu $17,2($2) # Te4[s0] + ins $14,$18,8,8 + lbu $18,2($24) # Te4[s1] + ins $15,$19,8,8 + lbu $19,2($25) # Te4[s2] + + ins $12,$20,24,8 + lw $8,0($3) + ins $13,$21,24,8 + lw $9,4($3) + ins $14,$22,24,8 + lw $10,8($3) + ins $15,$23,24,8 + lw $11,12($3) + + ins $12,$16,0,8 + ins $13,$17,0,8 + ins $14,$18,0,8 + ins $15,$19,0,8 +#else + lbu $16,2($1) # Te4[s2>>8] + srl $1,$8,22 + lbu $17,2($2) # Te4[s3>>8] + srl $2,$9,22 + lbu $18,2($24) # Te4[s0>>8] + srl $24,$10,22 + lbu $19,2($25) # Te4[s1>>8] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,2($1) # Te4[s0>>24] + sll $1,$11,2 + lbu $21,2($2) # Te4[s1>>24] + sll $2,$8,2 + lbu $22,2($24) # Te4[s2>>24] + sll $24,$9,2 + lbu $23,2($25) # Te4[s3>>24] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + sll $16,$16,8 + sll $17,$17,8 + sll $18,$18,8 + sll $19,$19,8 + + xor $12,$16 + lbu $16,2($1) # Te4[s3] + xor $13,$17 + lbu $17,2($2) # Te4[s0] + xor $14,$18 + lbu $18,2($24) # Te4[s1] + xor $15,$19 + lbu $19,2($25) # Te4[s2] + + sll $20,$20,24 + lw $8,0($3) + sll $21,$21,24 + lw $9,4($3) + sll $22,$22,24 + lw $10,8($3) + sll $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + #sll $16,$16,0 + #sll $17,$17,0 + #sll $18,$18,0 + #sll $19,$19,0 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 +#endif + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + 
.frame $29,64,$31 + .mask 0xc0ff0000,-4 + .set noreorder + .cpload $25 + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .set reorder + la $7,AES_Te # PIC-ified 'load address' + + lwl $8,0+0($4) + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + + bal _mips_AES_encrypt + + swr $8,0+3($5) + swr $9,4+3($5) + swr $10,8+3($5) + swr $11,12+3($5) + swl $8,0+0($5) + swl $9,4+0($5) + swl $10,8+0($5) + swl $11,12+0($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_encrypt +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $29,0,$31 + .set reorder + lw $12,0($6) + lw $13,4($6) + lw $14,8($6) + lw $15,12($6) + lw $30,240($6) + add $3,$6,16 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + sub $30,1 +#if defined(__mips_smartmips) + ext $1,$11,16,8 +.Loop_dec: + ext $2,$8,16,8 + ext $24,$9,16,8 + ext $25,$10,16,8 + lwxs $12,$1($7) # Td1[s3>>16] + ext $1,$10,8,8 + lwxs $13,$2($7) # Td1[s0>>16] + ext $2,$11,8,8 + lwxs $14,$24($7) # Td1[s1>>16] + ext $24,$8,8,8 + lwxs $15,$25($7) # Td1[s2>>16] + ext $25,$9,8,8 + + lwxs $16,$1($7) # Td2[s2>>8] + ext $1,$9,0,8 + lwxs $17,$2($7) # Td2[s3>>8] + ext $2,$10,0,8 + lwxs $18,$24($7) # Td2[s0>>8] + ext $24,$11,0,8 + lwxs $19,$25($7) # Td2[s1>>8] + ext $25,$8,0,8 + + lwxs $20,$1($7) # Td3[s1] + ext $1,$8,24,8 + lwxs $21,$2($7) # Td3[s2] + ext $2,$9,24,8 + lwxs $22,$24($7) # Td3[s3] + ext $24,$10,24,8 + lwxs $23,$25($7) # Td3[s0] + ext $25,$11,24,8 + + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 + + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + xor $12,$16 + lwxs $16,$1($7) # Td0[s0>>24] + xor $13,$17 + lwxs $17,$2($7) # Td0[s1>>24] + xor $14,$18 + lwxs $18,$24($7) # Td0[s2>>24] + xor $15,$19 + lwxs $19,$25($7) # Td0[s3>>24] + + rotr $20,$20,24 + lw $8,0($3) + rotr $21,$21,24 + lw $9,4($3) + rotr $22,$22,24 + lw $10,8($3) + rotr $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_dec + ext $1,$11,16,8 + + srl $1,$11,14 +#else + srl $1,$11,14 +.Loop_dec: + srl $2,$8,14 + srl $24,$9,14 + srl $25,$10,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $12,0($1) # Td1[s3>>16] + srl $1,$10,6 + lw $13,0($2) # Td1[s0>>16] + srl $2,$11,6 + lw $14,0($24) # Td1[s1>>16] + srl $24,$8,6 + lw $15,0($25) # Td1[s2>>16] + srl $25,$9,6 +#else + lwl $12,3($1) # Td1[s3>>16] + lwl $13,3($2) # Td1[s0>>16] + lwl $14,3($24) # Td1[s1>>16] + lwl $15,3($25) # Td1[s2>>16] + lwr $12,2($1) # Td1[s3>>16] + srl $1,$10,6 + lwr $13,2($2) # Td1[s0>>16] + srl $2,$11,6 + lwr $14,2($24) # Td1[s1>>16] + srl $24,$8,6 + lwr $15,2($25) # Td1[s2>>16] + srl $25,$9,6 +#endif + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || 
defined(_MIPS_ARCH_MIPS64R2) + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 +# if defined(_MIPSEL) + lw $16,0($1) # Td2[s2>>8] + sll $1,$9,2 + lw $17,0($2) # Td2[s3>>8] + sll $2,$10,2 + lw $18,0($24) # Td2[s0>>8] + sll $24,$11,2 + lw $19,0($25) # Td2[s1>>8] + sll $25,$8,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lw $20,0($1) # Td3[s1] + ins $1,$8,2,8 + lw $21,0($2) # Td3[s2] + ins $2,$9,2,8 + lw $22,0($24) # Td3[s3] + ins $24,$10,2,8 + lw $23,0($25) # Td3[s0] + ins $25,$11,2,8 +#else + lw $16,0($1) # Td2[s2>>8] + ins $1,$9,2,8 + lw $17,0($2) # Td2[s3>>8] + ins $2,$10,2,8 + lw $18,0($24) # Td2[s0>>8] + ins $24,$11,2,8 + lw $19,0($25) # Td2[s1>>8] + ins $25,$8,2,8 + + lw $20,0($1) # Td3[s1] + srl $1,$8,22 + lw $21,0($2) # Td3[s2] + srl $2,$9,22 + lw $22,0($24) # Td3[s3] + srl $24,$10,22 + lw $23,0($25) # Td3[s0] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + rotr $20,$20,24 + rotr $21,$21,24 + rotr $22,$22,24 + rotr $23,$23,24 +#else + lwl $16,2($1) # Td2[s2>>8] + lwl $17,2($2) # Td2[s3>>8] + lwl $18,2($24) # Td2[s0>>8] + lwl $19,2($25) # Td2[s1>>8] + lwr $16,1($1) # Td2[s2>>8] + sll $1,$9,2 + lwr $17,1($2) # Td2[s3>>8] + sll $2,$10,2 + lwr $18,1($24) # Td2[s0>>8] + sll $24,$11,2 + lwr $19,1($25) # Td2[s1>>8] + sll $25,$8,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,1($1) # Td3[s1] + lwl $21,1($2) # Td3[s2] + lwl $22,1($24) # Td3[s3] + lwl $23,1($25) # Td3[s0] + lwr $20,0($1) # Td3[s1] + srl $1,$8,22 + lwr $21,0($2) # Td3[s2] + srl $2,$9,22 + lwr $22,0($24) # Td3[s3] + srl $24,$10,22 + lwr $23,0($25) # Td3[s0] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + + xor $12,$16 + lw $16,0($1) # Td0[s0>>24] + xor $13,$17 + lw $17,0($2) # Td0[s1>>24] + xor $14,$18 + lw $18,0($24) # Td0[s2>>24] + xor $15,$19 + lw $19,0($25) # Td0[s3>>24] + + xor $12,$20 + lw $8,0($3) + xor $13,$21 + lw $9,4($3) + xor $14,$22 + lw $10,8($3) + xor $15,$23 + lw $11,12($3) + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_dec + srl $1,$11,14 +#endif + + .set reorder + lw $16,1024($7) # prefetch Td4 + srl $1,$11,16 + lw $17,1024+32($7) + srl $2,$8,16 + lw $18,1024+64($7) + srl $24,$9,16 + lw $19,1024+96($7) + srl $25,$10,16 + lw $20,1024+128($7) + and $1,0xff + lw $21,1024+160($7) + and $2,0xff + lw $22,1024+192($7) + and $24,0xff + lw $23,1024+224($7) + and $25,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,1024($1) # Td4[s3>>16] + srl $1,$10,8 + lbu $13,1024($2) # Td4[s0>>16] + srl $2,$11,8 + lbu $14,1024($24) # Td4[s1>>16] + srl $24,$8,8 + lbu $15,1024($25) # Td4[s2>>16] + srl $25,$9,8 + + and $1,0xff + and $2,0xff + and $24,0xff + and $25,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) + lbu $16,1024($1) # Td4[s2>>8] + ins $1,$8,0,8 + lbu $17,1024($2) # Td4[s3>>8] + ins $2,$9,0,8 + lbu $18,1024($24) # Td4[s0>>8] + ins $24,$10,0,8 + lbu $19,1024($25) # Td4[s1>>8] + ins $25,$11,0,8 + + lbu $20,1024($1) # Td4[s0>>24] + and $1,$9,0xff + lbu $21,1024($2) # 
Td4[s1>>24] + and $2,$10,0xff + lbu $22,1024($24) # Td4[s2>>24] + and $24,$11,0xff + lbu $23,1024($25) # Td4[s3>>24] + and $25,$8,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# else + lbu $16,1024($1) # Td4[s2>>8] + srl $1,$8,24 + lbu $17,1024($2) # Td4[s3>>8] + srl $2,$9,24 + lbu $18,1024($24) # Td4[s0>>8] + srl $24,$10,24 + lbu $19,1024($25) # Td4[s1>>8] + srl $25,$11,24 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,1024($1) # Td4[s0>>24] + ins $1,$9,0,8 + lbu $21,1024($2) # Td4[s1>>24] + ins $2,$10,0,8 + lbu $22,1024($24) # Td4[s2>>24] + ins $24,$11,0,8 + lbu $23,1024($25) # Td4[s3>>24] + ins $25,$8,0,8 +# endif + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + ins $12,$16,8,8 + lbu $16,1024($1) # Td4[s1] + ins $13,$17,8,8 + lbu $17,1024($2) # Td4[s2] + ins $14,$18,8,8 + lbu $18,1024($24) # Td4[s3] + ins $15,$19,8,8 + lbu $19,1024($25) # Td4[s0] + + ins $12,$20,24,8 + lw $8,0($3) + ins $13,$21,24,8 + lw $9,4($3) + ins $14,$22,24,8 + lw $10,8($3) + ins $15,$23,24,8 + lw $11,12($3) + + ins $12,$16,0,8 + ins $13,$17,0,8 + ins $14,$18,0,8 + ins $15,$19,0,8 +#else + lbu $16,1024($1) # Td4[s2>>8] + srl $1,$8,24 + lbu $17,1024($2) # Td4[s3>>8] + srl $2,$9,24 + lbu $18,1024($24) # Td4[s0>>8] + srl $24,$10,24 + lbu $19,1024($25) # Td4[s1>>8] + srl $25,$11,24 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,1024($1) # Td4[s0>>24] + and $1,$9,0xff + lbu $21,1024($2) # Td4[s1>>24] + and $2,$10,0xff + lbu $22,1024($24) # Td4[s2>>24] + and $24,$11,0xff + lbu $23,1024($25) # Td4[s3>>24] + and $25,$8,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + sll $16,$16,8 + sll $17,$17,8 + sll $18,$18,8 + sll $19,$19,8 + + xor $12,$16 + lbu $16,1024($1) # Td4[s1] + xor $13,$17 + lbu $17,1024($2) # Td4[s2] + xor $14,$18 + lbu $18,1024($24) # Td4[s3] + xor $15,$19 + lbu $19,1024($25) # Td4[s0] + + sll $20,$20,24 + lw $8,0($3) + sll $21,$21,24 + lw $9,4($3) + sll $22,$22,24 + lw $10,8($3) + sll $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + #sll $16,$16,0 + #sll $17,$17,0 + #sll $18,$18,0 + #sll $19,$19,0 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 +#endif + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $29,64,$31 + .mask 0xc0ff0000,-4 + .set noreorder + .cpload $25 + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .set reorder + la $7,AES_Td # PIC-ified 'load address' + + lwl $8,0+0($4) + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + + bal _mips_AES_decrypt + + swr $8,0+3($5) + swr $9,4+3($5) + swr $10,8+3($5) + swr $11,12+3($5) + swl $8,0+0($5) + swl $9,4+0($5) + swl $10,8+0($5) + swl $11,12+0($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_decrypt +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $29,0,$31 + .set noreorder + beqz $4,.Lekey_done + li $2,-1 + beqz $6,.Lekey_done + add $3,$7,256 + + .set 
reorder + lwl $8,0+0($4) # load 128 bits + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + li $1,128 + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + .set noreorder + beq $5,$1,.L128bits + li $30,10 + + .set reorder + lwl $12,16+0($4) # load 192 bits + lwl $13,20+0($4) + li $1,192 + lwr $12,16+3($4) + lwr $13,20+3($4) + .set noreorder + beq $5,$1,.L192bits + li $30,8 + + .set reorder + lwl $14,24+0($4) # load 256 bits + lwl $15,28+0($4) + li $1,256 + lwr $14,24+3($4) + lwr $15,28+3($4) + .set noreorder + beq $5,$1,.L256bits + li $30,7 + + b .Lekey_done + li $2,-2 + +.align 4 +.L128bits: + .set reorder + srl $1,$11,16 + srl $2,$11,8 + and $1,0xff + and $2,0xff + and $24,$11,0xff + srl $25,$11,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sub $30,1 + add $6,16 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + + .set noreorder + bnez $30,.L128bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,10 + sw $11,12($6) + li $2,0 + sw $30,80($6) + b .Lekey_done + sub $6,10*16 + +.align 4 +.L192bits: + .set reorder + srl $1,$13,16 + srl $2,$13,8 + and $1,0xff + and $2,0xff + and $24,$13,0xff + srl $25,$13,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sub $30,1 + add $6,24 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + xor $12,$11 + xor $13,$12 + + .set noreorder + bnez $30,.L192bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,12 + sw $11,12($6) + li $2,0 + sw $30,48($6) + b .Lekey_done + sub $6,12*16 + +.align 4 +.L256bits: + .set reorder + srl $1,$15,16 + srl $2,$15,8 + and $1,0xff + and $2,0xff + and $24,$15,0xff + srl $25,$15,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sw $14,24($6) + sw $15,28($6) + sub $30,1 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + beqz $30,.L256bits_done + + srl $1,$11,24 + srl $2,$11,16 + srl $24,$11,8 + and $25,$11,0xff + and $2,0xff + and $24,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + sll $1,24 + sll $2,16 + sll $24,8 + + xor $12,$1 + xor $12,$2 + xor $12,$24 + xor $12,$25 + + xor $13,$12 + xor $14,$13 + xor $15,$14 + + add $6,32 + .set noreorder + b .L256bits + add $3,4 + +.L256bits_done: + sw $8,32($6) + sw $9,36($6) + sw $10,40($6) + li $30,14 + sw $11,44($6) + li $2,0 + sw $30,48($6) + sub $6,12*16 + +.Lekey_done: + jr $31 + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $29,32,$31 + .mask 0xc0000000,-4 + .set noreorder + .cpload $25 + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .set reorder + la $7,AES_Te4 # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + 
.set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_encrypt_key +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $29,32,$31 + .mask 0xc0000000,-4 + .set noreorder + .cpload $25 + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .set reorder + la $7,AES_Te4 # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $2,.Ldkey_done + + sll $1,$30,4 + add $4,$6,0 + add $5,$6,$1 +.align 4 +.Lswap: + lw $8,0($4) + lw $9,4($4) + lw $10,8($4) + lw $11,12($4) + lw $12,0($5) + lw $13,4($5) + lw $14,8($5) + lw $15,12($5) + sw $8,0($5) + sw $9,4($5) + sw $10,8($5) + sw $11,12($5) + add $4,16 + sub $5,16 + sw $12,-16($4) + sw $13,-12($4) + sw $14,-8($4) + sw $15,-4($4) + bne $4,$5,.Lswap + + lw $8,16($6) # modulo-scheduled + lui $2,0x8080 + sub $30,1 + or $2,0x8080 + sll $30,2 + add $6,16 + lui $25,0x1b1b + nor $24,$0,$2 + or $25,0x1b1b +.align 4 +.Lmix: + and $1,$8,$2 + and $9,$8,$24 + srl $10,$1,7 + addu $9,$9 # tp2<<1 + subu $1,$10 + and $1,$25 + xor $9,$1 + + and $1,$9,$2 + and $10,$9,$24 + srl $11,$1,7 + addu $10,$10 # tp4<<1 + subu $1,$11 + and $1,$25 + xor $10,$1 + + and $1,$10,$2 + and $11,$10,$24 + srl $12,$1,7 + addu $11,$11 # tp8<<1 + subu $1,$12 + and $1,$25 + xor $11,$1 + + xor $12,$11,$8 + xor $15,$11,$10 + xor $13,$12,$9 + xor $14,$12,$10 + +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $8,$14,16 + xor $15,$9 + rotr $9,$12,8 + xor $15,$8 + rotr $10,$13,24 + xor $15,$9 + lw $8,4($6) # modulo-scheduled + xor $15,$10 +#else + srl $8,$14,16 + xor $15,$9 + sll $9,$14,16 + xor $15,$8 + srl $8,$12,8 + xor $15,$9 + sll $9,$12,24 + xor $15,$8 + srl $8,$13,24 + xor $15,$9 + sll $9,$13,8 + xor $15,$8 + lw $8,4($6) # modulo-scheduled + xor $15,$9 +#endif + sub $30,1 + sw $15,0($6) + add $6,4 + bnez $30,.Lmix + + li $2,0 +.Ldkey_done: + .set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_decrypt_key +.rdata +.align 10 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 
0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 
0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 
0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 
0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +AES_Te4: +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 
0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00	# rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00

From 4d797fb55dda1eb6e9874905bafe36ac674d75e9 Mon Sep 17 00:00:00 2001
From: Linus Yang
Date: Fri, 18 Aug 2017 22:27:03 +0800
Subject: [PATCH 2/2] Only use assembly code

---
 lib/aes_acc/aes0.c      |  600 --------------------
 lib/aes_acc/aes0.h      |   45 --
 lib/aes_acc/aesacc.c    |  197 +++----
 lib/aes_acc/aesacc.h    |   20 -
 lib/aes_acc/aesni.c     |   13 +-
 lib/aes_acc/aesni.h     |   12 +-
 lib/aes_acc/asm/arm64.S | 1178 +++++++++++++++++++++++++++++++++++++++
 lib/aes_acc/asm/x64.S   |  827 +++++++++++++++++++++++++++
 makefile                |    8 +-
 9 files changed, 2108 insertions(+), 792 deletions(-)
 delete mode 100755 lib/aes_acc/aes0.c
 delete mode 100755 lib/aes_acc/aes0.h
 delete mode 100644 lib/aes_acc/aesacc.h
 create mode 100644 lib/aes_acc/asm/arm64.S
 create mode 100644 lib/aes_acc/asm/x64.S

diff --git a/lib/aes_acc/aes0.c b/lib/aes_acc/aes0.c
deleted file mode 100755
index 2fdf02b..0000000
--- a/lib/aes_acc/aes0.c
+++ /dev/null
@@ -1,600 +0,0 @@
-
-/*
- * this file comes from https://github.com/kokke/tiny-AES128-C
- */
-
-/*
-
-This is an implementation of the AES algorithm, specifically ECB and CBC mode.
-Block size can be chosen in aes.h - available choices are AES128, AES192, AES256.
-
-The implementation is verified against the test vectors in:
-  National Institute of Standards and Technology Special Publication 800-38A 2001 ED
-
-ECB-AES128
-----------
-
-  plain-text:
-    6bc1bee22e409f96e93d7e117393172a
-    ae2d8a571e03ac9c9eb76fac45af8e51
-    30c81c46a35ce411e5fbc1191a0a52ef
-    f69f2445df4f9b17ad2b417be66c3710
-
-  key:
-    2b7e151628aed2a6abf7158809cf4f3c
-
-  resulting cipher
-    3ad77bb40d7a3660a89ecaf32466ef97
-    f5d3d58503b9699de785895a96fdbaaf
-    43b1cd7f598ece23881b00e3ed030688
-    7b0c785e27e8ad3f8223207104725dd4
-
-
-NOTE:   String length must be evenly divisible by 16byte (str_len % 16 == 0)
-        You should pad the end of the string with zeros if this is not the case.
-        For AES192/256 the block size is proportionally larger.
-
-*/
-
-
-/*****************************************************************************/
-/* Includes:                                                                 */
-/*****************************************************************************/
-#include <stdint.h>
-#include <string.h> // CBC mode, for memset
-#include "aes0.h"
-
-/*****************************************************************************/
-/* Defines:                                                                  */
-/*****************************************************************************/
-// The number of columns comprising a state in AES. This is a constant in AES. Value=4
-#define Nb 4
-#define BLOCKLEN 16 //Block length in bytes AES is 128b block only
-
-#if defined(AES256) && (AES256 == 1)
-    #define Nk 8
-    #define KEYLEN 32
-    #define Nr 14
-    #define keyExpSize 240
-#elif defined(AES192) && (AES192 == 1)
-    #define Nk 6
-    #define KEYLEN 24
-    #define Nr 12
-    #define keyExpSize 208
-#else
-    #define Nk 4        // The number of 32 bit words in a key.
-    #define KEYLEN 16   // Key length in bytes
-    #define Nr 10       // The number of rounds in AES Cipher.
-    #define keyExpSize 176
-#endif
-
-// jcallan@github points out that declaring Multiply as a function
-// reduces code size considerably with the Keil ARM compiler.
-// See this link for more information: https://github.com/kokke/tiny-AES128-C/pull/3 -#ifndef MULTIPLY_AS_A_FUNCTION - #define MULTIPLY_AS_A_FUNCTION 0 -#endif - - -/*****************************************************************************/ -/* Private variables: */ -/*****************************************************************************/ -// state - array holding the intermediate results during decryption. -typedef uint8_t state_t[4][4]; -static state_t* state; - -// The array that stores the round keys. -static uint8_t RoundKey[keyExpSize]; - -// The Key input to the AES Program -static const uint8_t* Key; - -#if defined(CBC) && CBC - // Initial Vector used only for CBC mode - static uint8_t* Iv; -#endif - -// The lookup-tables are marked const so they can be placed in read-only storage instead of RAM -// The numbers below can be computed dynamically trading ROM for RAM - -// This can be useful in (embedded) bootloader applications, where ROM is often limited. -static const uint8_t sbox[256] = { - //0 1 2 3 4 5 6 7 8 9 A B C D E F - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; - -static const uint8_t rsbox[256] = { - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, - 
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }; - -// The round constant word array, Rcon[i], contains the values given by -// x to th e power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8) -static const uint8_t Rcon[11] = { - 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; - -/* - * Jordan Goulder points out in PR #12 (https://github.com/kokke/tiny-AES128-C/pull/12), - * that you can remove most of the elements in the Rcon array, because they are unused. - * - * From Wikipedia's article on the Rijndael key schedule @ https://en.wikipedia.org/wiki/Rijndael_key_schedule#Rcon - * - * "Only the first some of these constants are actually used – up to rcon[10] for AES-128 (as 11 round keys are needed), - * up to rcon[8] for AES-192, up to rcon[7] for AES-256. rcon[0] is not used in AES algorithm." - * - * ... which is why the full array below has been 'disabled' below. - */ -#if 0 -static const uint8_t Rcon[256] = { - 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, - 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, - 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, - 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, - 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, - 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, - 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, - 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, - 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, - 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, - 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, - 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, - 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, - 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, - 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, - 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d }; -#endif - -/*****************************************************************************/ -/* Private functions: */ -/*****************************************************************************/ -static uint8_t getSBoxValue(uint8_t num) -{ - return sbox[num]; -} - -static uint8_t getSBoxInvert(uint8_t num) -{ - return rsbox[num]; -} - -// This function produces Nb(Nr+1) 
round keys. The round keys are used in each round to decrypt the states. -static void KeyExpansion(void) -{ - uint32_t i, k; - uint8_t tempa[4]; // Used for the column/row operations - - // The first round key is the key itself. - for (i = 0; i < Nk; ++i) - { - RoundKey[(i * 4) + 0] = Key[(i * 4) + 0]; - RoundKey[(i * 4) + 1] = Key[(i * 4) + 1]; - RoundKey[(i * 4) + 2] = Key[(i * 4) + 2]; - RoundKey[(i * 4) + 3] = Key[(i * 4) + 3]; - } - - // All other round keys are found from the previous round keys. - //i == Nk - for (; i < Nb * (Nr + 1); ++i) - { - { - tempa[0]=RoundKey[(i-1) * 4 + 0]; - tempa[1]=RoundKey[(i-1) * 4 + 1]; - tempa[2]=RoundKey[(i-1) * 4 + 2]; - tempa[3]=RoundKey[(i-1) * 4 + 3]; - } - - if (i % Nk == 0) - { - // This function shifts the 4 bytes in a word to the left once. - // [a0,a1,a2,a3] becomes [a1,a2,a3,a0] - - // Function RotWord() - { - k = tempa[0]; - tempa[0] = tempa[1]; - tempa[1] = tempa[2]; - tempa[2] = tempa[3]; - tempa[3] = k; - } - - // SubWord() is a function that takes a four-byte input word and - // applies the S-box to each of the four bytes to produce an output word. - - // Function Subword() - { - tempa[0] = getSBoxValue(tempa[0]); - tempa[1] = getSBoxValue(tempa[1]); - tempa[2] = getSBoxValue(tempa[2]); - tempa[3] = getSBoxValue(tempa[3]); - } - - tempa[0] = tempa[0] ^ Rcon[i/Nk]; - } -#if defined(AES256) && (AES256 == 1) - if (i % Nk == 4) - { - // Function Subword() - { - tempa[0] = getSBoxValue(tempa[0]); - tempa[1] = getSBoxValue(tempa[1]); - tempa[2] = getSBoxValue(tempa[2]); - tempa[3] = getSBoxValue(tempa[3]); - } - } -#endif - RoundKey[i * 4 + 0] = RoundKey[(i - Nk) * 4 + 0] ^ tempa[0]; - RoundKey[i * 4 + 1] = RoundKey[(i - Nk) * 4 + 1] ^ tempa[1]; - RoundKey[i * 4 + 2] = RoundKey[(i - Nk) * 4 + 2] ^ tempa[2]; - RoundKey[i * 4 + 3] = RoundKey[(i - Nk) * 4 + 3] ^ tempa[3]; - } -} - -// This function adds the round key to state. -// The round key is added to the state by an XOR function. -static void AddRoundKey(uint8_t round) -{ - uint8_t i,j; - for (i=0;i<4;++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[i][j] ^= RoundKey[round * Nb * 4 + i * Nb + j]; - } - } -} - -// The SubBytes Function Substitutes the values in the -// state matrix with values in an S-box. -static void SubBytes(void) -{ - uint8_t i, j; - for (i = 0; i < 4; ++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[j][i] = getSBoxValue((*state)[j][i]); - } - } -} - -// The ShiftRows() function shifts the rows in the state to the left. -// Each row is shifted with different offset. -// Offset = Row number. So the first row is not shifted. 
-static void ShiftRows(void) -{ - uint8_t temp; - - // Rotate first row 1 columns to left - temp = (*state)[0][1]; - (*state)[0][1] = (*state)[1][1]; - (*state)[1][1] = (*state)[2][1]; - (*state)[2][1] = (*state)[3][1]; - (*state)[3][1] = temp; - - // Rotate second row 2 columns to left - temp = (*state)[0][2]; - (*state)[0][2] = (*state)[2][2]; - (*state)[2][2] = temp; - - temp = (*state)[1][2]; - (*state)[1][2] = (*state)[3][2]; - (*state)[3][2] = temp; - - // Rotate third row 3 columns to left - temp = (*state)[0][3]; - (*state)[0][3] = (*state)[3][3]; - (*state)[3][3] = (*state)[2][3]; - (*state)[2][3] = (*state)[1][3]; - (*state)[1][3] = temp; -} - -static uint8_t xtime(uint8_t x) -{ - return ((x<<1) ^ (((x>>7) & 1) * 0x1b)); -} - -// MixColumns function mixes the columns of the state matrix -static void MixColumns(void) -{ - uint8_t i; - uint8_t Tmp,Tm,t; - for (i = 0; i < 4; ++i) - { - t = (*state)[i][0]; - Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3] ; - Tm = (*state)[i][0] ^ (*state)[i][1] ; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp ; - Tm = (*state)[i][1] ^ (*state)[i][2] ; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp ; - Tm = (*state)[i][2] ^ (*state)[i][3] ; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp ; - Tm = (*state)[i][3] ^ t ; Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp ; - } -} - -// Multiply is used to multiply numbers in the field GF(2^8) -#if MULTIPLY_AS_A_FUNCTION -static uint8_t Multiply(uint8_t x, uint8_t y) -{ - return (((y & 1) * x) ^ - ((y>>1 & 1) * xtime(x)) ^ - ((y>>2 & 1) * xtime(xtime(x))) ^ - ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ - ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))); - } -#else -#define Multiply(x, y) \ - ( ((y & 1) * x) ^ \ - ((y>>1 & 1) * xtime(x)) ^ \ - ((y>>2 & 1) * xtime(xtime(x))) ^ \ - ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ \ - ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))) \ - -#endif - -// MixColumns function mixes the columns of the state matrix. -// The method used to multiply may be difficult to understand for the inexperienced. -// Please use the references to gain more information. -static void InvMixColumns(void) -{ - int i; - uint8_t a, b, c, d; - for (i = 0; i < 4; ++i) - { - a = (*state)[i][0]; - b = (*state)[i][1]; - c = (*state)[i][2]; - d = (*state)[i][3]; - - (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09); - (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d); - (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b); - (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e); - } -} - - -// The SubBytes Function Substitutes the values in the -// state matrix with values in an S-box. 
-static void InvSubBytes(void) -{ - uint8_t i,j; - for (i = 0; i < 4; ++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[j][i] = getSBoxInvert((*state)[j][i]); - } - } -} - -static void InvShiftRows(void) -{ - uint8_t temp; - - // Rotate first row 1 columns to right - temp = (*state)[3][1]; - (*state)[3][1] = (*state)[2][1]; - (*state)[2][1] = (*state)[1][1]; - (*state)[1][1] = (*state)[0][1]; - (*state)[0][1] = temp; - - // Rotate second row 2 columns to right - temp = (*state)[0][2]; - (*state)[0][2] = (*state)[2][2]; - (*state)[2][2] = temp; - - temp = (*state)[1][2]; - (*state)[1][2] = (*state)[3][2]; - (*state)[3][2] = temp; - - // Rotate third row 3 columns to right - temp = (*state)[0][3]; - (*state)[0][3] = (*state)[1][3]; - (*state)[1][3] = (*state)[2][3]; - (*state)[2][3] = (*state)[3][3]; - (*state)[3][3] = temp; -} - - -// Cipher is the main function that encrypts the PlainText. -static void Cipher(void) -{ - uint8_t round = 0; - - // Add the First round key to the state before starting the rounds. - AddRoundKey(0); - - // There will be Nr rounds. - // The first Nr-1 rounds are identical. - // These Nr-1 rounds are executed in the loop below. - for (round = 1; round < Nr; ++round) - { - SubBytes(); - ShiftRows(); - MixColumns(); - AddRoundKey(round); - } - - // The last round is given below. - // The MixColumns function is not here in the last round. - SubBytes(); - ShiftRows(); - AddRoundKey(Nr); -} - -static void InvCipher(void) -{ - uint8_t round=0; - - // Add the First round key to the state before starting the rounds. - AddRoundKey(Nr); - - // There will be Nr rounds. - // The first Nr-1 rounds are identical. - // These Nr-1 rounds are executed in the loop below. - for (round = (Nr - 1); round > 0; --round) - { - InvShiftRows(); - InvSubBytes(); - AddRoundKey(round); - InvMixColumns(); - } - - // The last round is given below. - // The MixColumns function is not here in the last round. - InvShiftRows(); - InvSubBytes(); - AddRoundKey(0); -} - - -/*****************************************************************************/ -/* Public functions: */ -/*****************************************************************************/ -#if defined(ECB) && (ECB == 1) - - -void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length) -{ - // Copy input to output, and work in-memory on output - memcpy(output, input, length); - state = (state_t*)output; - - Key = key; - KeyExpansion(); - - // The next function call encrypts the PlainText with the Key using AES algorithm. - Cipher(); -} - -void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length) -{ - // Copy input to output, and work in-memory on output - memcpy(output, input, length); - state = (state_t*)output; - - // The KeyExpansion routine must be called before encryption. - Key = key; - KeyExpansion(); - - InvCipher(); -} - - -#endif // #if defined(ECB) && (ECB == 1) - - - - - -#if defined(CBC) && (CBC == 1) - - -static void XorWithIv(uint8_t* buf) -{ - uint8_t i; - for (i = 0; i < BLOCKLEN; ++i) //WAS for(i = 0; i < KEYLEN; ++i) but the block in AES is always 128bit so 16 bytes! 
- { - buf[i] ^= Iv[i]; - } -} - -void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) -{ - uintptr_t i; - uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */ - - // Skip the key expansion if key is passed as 0 - if (0 != key) - { - Key = key; - KeyExpansion(); - } - - if (iv != 0) - { - Iv = (uint8_t*)iv; - } - - for (i = 0; i < length; i += BLOCKLEN) - { - XorWithIv(input); - memcpy(output, input, BLOCKLEN); - state = (state_t*)output; - Cipher(); - Iv = output; - input += BLOCKLEN; - output += BLOCKLEN; - //printf("Step %d - %d", i/16, i); - } - - if (extra) - { - memcpy(output, input, extra); - state = (state_t*)output; - Cipher(); - } -} - -void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) -{ - uintptr_t i; - uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */ - - // Skip the key expansion if key is passed as 0 - if (0 != key) - { - Key = key; - KeyExpansion(); - } - - // If iv is passed as 0, we continue to encrypt without re-setting the Iv - if (iv != 0) - { - Iv = (uint8_t*)iv; - } - - for (i = 0; i < length; i += BLOCKLEN) - { - memcpy(output, input, BLOCKLEN); - state = (state_t*)output; - InvCipher(); - XorWithIv(output); - Iv = input; - input += BLOCKLEN; - output += BLOCKLEN; - } - - if (extra) - { - memcpy(output, input, extra); - state = (state_t*)output; - InvCipher(); - } -} - -#endif // #if defined(CBC) && (CBC == 1) diff --git a/lib/aes_acc/aes0.h b/lib/aes_acc/aes0.h deleted file mode 100755 index cc693fc..0000000 --- a/lib/aes_acc/aes0.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * this file comes from https://github.com/kokke/tiny-AES128-C - */ - -#ifndef _AES_H_ -#define _AES_H_ - -#include - - -// #define the macros below to 1/0 to enable/disable the mode of operation. -// -// CBC enables AES encryption in CBC-mode of operation. -// ECB enables the basic ECB 16-byte block algorithm. Both can be enabled simultaneously. - -// The #ifndef-guard allows it to be configured before #include'ing or at compile time. -#ifndef CBC - #define CBC 1 -#endif - -#ifndef ECB - #define ECB 1 -#endif - -#define AES128 1 -//#define AES192 1 -//#define AES256 1 - -#if defined(ECB) && (ECB == 1) - -void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); - -#endif // #if defined(ECB) && (ECB == !) 
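For reference, a minimal round-trip through the removed tiny-AES CBC API above (standalone sketch, not part of the patch; the all-zero key and IV are placeholders). It illustrates the chaining C_i = E_K(P_i ^ C_{i-1}); note that AES_CBC_encrypt_buffer0() XORs the IV into its input buffer in place, so a pristine copy is kept for the comparison:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "aes0.h"   /* the header being removed by this patch */

    int main(void)
    {
        const uint8_t key[16] = {0};   /* placeholder key */
        const uint8_t iv[16]  = {0};   /* placeholder IV  */
        uint8_t plain[16], ref[16], cipher[16], out[16];

        memcpy(plain, "0123456789abcdef", 16);
        memcpy(ref, plain, 16);        /* encryption clobbers its input */

        AES_CBC_encrypt_buffer0(cipher, plain, 16, key, iv); /* C1 = E_K(P1 ^ IV) */
        AES_CBC_decrypt_buffer0(out, cipher, 16, key, iv);   /* P1 = D_K(C1) ^ IV */
        assert(memcmp(out, ref, 16) == 0);
        return 0;
    }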
- - -#if defined(CBC) && (CBC == 1) - -void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); -void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); - -#endif // #if defined(CBC) && (CBC == 1) - - -#endif //_AES_H_ diff --git a/lib/aes_acc/aesacc.c b/lib/aes_acc/aesacc.c index 73c76ab..2d11896 100644 --- a/lib/aes_acc/aesacc.c +++ b/lib/aes_acc/aesacc.c @@ -2,27 +2,25 @@ * This file is adapted from PolarSSL 1.3.19 (GPL) */ -#include "aes0.h" #include "aesni.h" #include "aesarm.h" -#include "aesacc.h" - +#include #include #if defined(AES256) && (AES256 == 1) #define AES_KEYSIZE 256 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_256 + #define aeshw_setkey_enc aesni_setkey_enc_256 #endif #elif defined(AES192) && (AES192 == 1) #define AES_KEYSIZE 192 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_192 + #define aeshw_setkey_enc aesni_setkey_enc_192 #endif #else #define AES_KEYSIZE 128 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_128 + #define aeshw_setkey_enc aesni_setkey_enc_128 #endif #endif @@ -31,15 +29,15 @@ #ifdef HAVE_AMD64 #define HAVE_HARDAES 1 -#define aes_supported aesni_supported -#define aes_crypt_ecb aesni_crypt_ecb -#define aes_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR) +#define aeshw_supported aesni_supported +#define aeshw_crypt_ecb aesni_crypt_ecb +#define aeshw_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR) #endif /* HAVE_AMD64 */ #ifdef HAVE_ARM64 #define HAVE_HARDAES 1 -#define aes_supported aesarm_supported -#define aes_crypt_ecb aesarm_crypt_ecb +#define aeshw_supported aesarm_supported +#define aeshw_crypt_ecb aesarm_crypt_ecb #include "aesarm_table.h" @@ -53,7 +51,7 @@ } #endif -static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) +static void aeshw_setkey_enc(uint8_t *rk, const uint8_t *key) { unsigned int i; uint32_t *RK; @@ -129,7 +127,7 @@ static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) } } -static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) +static void aeshw_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) { int i, j; uint32_t *RK; @@ -159,18 +157,32 @@ static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) *RK++ = *SK++; *RK++ = *SK++; } - #endif /* HAVE_ARM64 */ -#ifdef HAVE_ASM +#ifdef HAVE_HARDAES +static void aeshw_setkey_dec(uint8_t *rk, const uint8_t *key) +{ + uint8_t rk_tmp[AES_RKSIZE]; + aeshw_setkey_enc(rk_tmp, key); + aeshw_inverse_key(rk, rk_tmp); +} +#endif /* HAVE_HARDAES */ +/* OpenSSL assembly functions */ #define AES_MAXNR 14 - typedef struct { uint32_t rd_key[4 * (AES_MAXNR + 1)]; - int rounds; + uint32_t rounds; } AES_KEY; +#if defined(__amd64__) || defined(__x86_64__) || \ + defined(__aarch64__) +#define AES_set_encrypt_key vpaes_set_encrypt_key +#define AES_set_decrypt_key vpaes_set_decrypt_key +#define AES_encrypt vpaes_encrypt +#define AES_decrypt vpaes_decrypt +#endif /* VPAES for 64-bit Intel and ARM */ + #ifdef __cplusplus extern "C" { #endif @@ -189,69 +201,51 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, } #endif -static int aes_supported(void) -{ - return 2; -} - static void aes_crypt_ecb( int nr, unsigned char *rk, int mode, const unsigned char input[16], unsigned char output[16] ) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = nr; if (mode == AES_DECRYPT) { - AES_decrypt(input, output, ctx); + AES_decrypt(input, output, (AES_KEY *) rk); } else { - AES_encrypt(input, output, ctx); + 
AES_encrypt(input, output, (AES_KEY *) rk); } } static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = AES_NR; - AES_set_encrypt_key(key, AES_KEYSIZE, ctx); + AES_set_encrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk); } static void aes_setkey_dec(uint8_t *rk, const uint8_t *key) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = AES_NR; - AES_set_decrypt_key(key, AES_KEYSIZE, ctx); + AES_set_decrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk); } -#endif +static void (*crypt_ecb) ( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ) + = aes_crypt_ecb; -#ifdef HAVE_HARDAES +static void (*setkey_enc) (uint8_t *rk, const uint8_t *key) + = aes_setkey_enc; -static void aes_setkey_dec(uint8_t *rk, const uint8_t *key) -{ - uint8_t rk_tmp[AES_RKSIZE]; - aes_setkey_enc(rk_tmp, key); - aes_inverse_key(rk, rk_tmp); -} - -#endif - -#if defined(HAVE_HARDAES) || defined(HAVE_ASM) - -#define HAVE_ACC 1 +static void (*setkey_dec) (uint8_t *rk, const uint8_t *key) + = aes_setkey_dec; /* * AESNI-CBC buffer encryption/decryption */ -static void aes_crypt_cbc( int mode, - uint8_t* rk, - uint32_t length, - uint8_t iv[16], - const uint8_t *input, - uint8_t *output ) +static void crypt_cbc( int mode, + uint8_t* rk, + uint32_t length, + uint8_t iv[16], + const uint8_t *input, + uint8_t *output ) { int i; uint8_t temp[16]; @@ -261,7 +255,7 @@ static void aes_crypt_cbc( int mode, while( length > 0 ) { memcpy( temp, input, 16 ); - aes_crypt_ecb( AES_NR, rk, mode, input, output ); + crypt_ecb( AES_NR, rk, mode, input, output ); for( i = 0; i < 16; i++ ) output[i] = (uint8_t)( output[i] ^ iv[i] ); @@ -280,7 +274,7 @@ static void aes_crypt_cbc( int mode, for( i = 0; i < 16; i++ ) output[i] = (uint8_t)( input[i] ^ iv[i] ); - aes_crypt_ecb( AES_NR, rk, mode, output, output ); + crypt_ecb( AES_NR, rk, mode, output, output ); memcpy( iv, output, 16 ); input += 16; @@ -290,12 +284,26 @@ static void aes_crypt_cbc( int mode, } } -#endif /* HAVE_HARDAES or HAVE_ASM */ -int AESACC_supported(void) +static void aeshw_init(void) { -#if defined(HAVE_ACC) - return aes_supported(); +#ifdef HAVE_HARDAES + static int done = 0; + if (!done) { + if (aeshw_supported()) { + crypt_ecb = aeshw_crypt_ecb; + setkey_enc = aeshw_setkey_enc; + setkey_dec = aeshw_setkey_dec; + } + done = 1; + } +#endif +} + +int AES_support_hwaccel(void) +{ +#ifdef HAVE_HARDAES + return aeshw_supported(); #else return 0; #endif @@ -303,86 +311,59 @@ int AESACC_supported(void) void AES_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) { -#if defined(HAVE_ACC) uint8_t iv_tmp[16]; uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL || iv == NULL) { - if (key == NULL || iv == NULL) - { - return; - } - memcpy(iv_tmp, iv, 16); - aes_setkey_enc(rk, key); - aes_crypt_cbc(AES_ENCRYPT, rk, \ - length, iv_tmp, input, output); return; } -#endif - - AES_CBC_encrypt_buffer0(output, input, length, key, iv); + aeshw_init(); + memcpy(iv_tmp, iv, 16); + setkey_enc(rk, key); + crypt_cbc(AES_ENCRYPT, rk, \ + length, iv_tmp, input, output); } void AES_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) { -#if defined(HAVE_ACC) uint8_t iv_tmp[16]; uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL || iv == NULL) { - if (key == NULL || iv == NULL) - { - return; - } - memcpy(iv_tmp, iv, 16); - aes_setkey_dec(rk, key); - 
aes_crypt_cbc(AES_DECRYPT, rk, \ - length, iv_tmp, input, output); return; } -#endif + aeshw_init(); + memcpy(iv_tmp, iv, 16); + setkey_dec(rk, key); + crypt_cbc(AES_DECRYPT, rk, \ + length, iv_tmp, input, output); - AES_CBC_decrypt_buffer0(output, input, length, key, iv); } void AES_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length) { -#if defined(HAVE_ACC) uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL) { - if (key == NULL) - { - return; - } - aes_setkey_enc(rk, key); - aes_crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output); return; } -#endif - - AES_ECB_encrypt0(input, key, output, length); + aeshw_init(); + setkey_enc(rk, key); + crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output); } void AES_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length) { -#if defined(HAVE_ACC) uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL) { - if (key == NULL) - { - return; - } - aes_setkey_dec(rk, key); - aes_crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output); return; } -#endif - - AES_ECB_decrypt0(input, key, output, length); + aeshw_init(); + setkey_dec(rk, key); + crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output); } diff --git a/lib/aes_acc/aesacc.h b/lib/aes_acc/aesacc.h deleted file mode 100644 index afc36d0..0000000 --- a/lib/aes_acc/aesacc.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _AESACC_H_ -#define _AESACC_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -int AESACC_supported(void); -void AESACC_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AESACC_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AESACC_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); -void AESACC_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); - -#ifdef __cplusplus -} -#endif - -#endif /* _AESACC_H_ */ diff --git a/lib/aes_acc/aesni.c b/lib/aes_acc/aesni.c index 66e6cb8..85aa22b 100644 --- a/lib/aes_acc/aesni.c +++ b/lib/aes_acc/aesni.c @@ -86,11 +86,11 @@ int aesni_supported( void ) /* * AES-NI AES-ECB block en(de)cryption */ -int aesni_crypt_ecb( int nr, - unsigned char *rk, - int mode, - const unsigned char input[16], - unsigned char output[16] ) +void aesni_crypt_ecb( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ) { asm( "movdqu (%3), %%xmm0 \n\t" // load input "movdqu (%1), %%xmm1 \n\t" // load round key 0 @@ -124,9 +124,6 @@ int aesni_crypt_ecb( int nr, : : "r" (nr), "r" (rk), "r" (mode), "r" (input), "r" (output) : "memory", "cc", "xmm0", "xmm1" ); - - - return( 0 ); } /* diff --git a/lib/aes_acc/aesni.h b/lib/aes_acc/aesni.h index 42b6aa1..b106b9c 100644 --- a/lib/aes_acc/aesni.h +++ b/lib/aes_acc/aesni.h @@ -64,14 +64,12 @@ int aesni_supported( void ); * \param mode AES_ENCRYPT or AES_DECRYPT * \param input 16-byte input block * \param output 16-byte output block - * - * \return 0 on success (cannot fail) */ -int aesni_crypt_ecb( int nr, - unsigned char *rk, - int mode, - const unsigned char input[16], - unsigned char output[16] ); +void aesni_crypt_ecb( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ); /** * \brief Compute decryption round keys from encryption round keys diff --git a/lib/aes_acc/asm/arm64.S b/lib/aes_acc/asm/arm64.S new file mode 100644 index 
0000000..805b78c --- /dev/null +++ b/lib/aes_acc/asm/arm64.S @@ -0,0 +1,1178 @@ +.text + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +## 
+## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + adr x11, .Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
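+## (Input block in v7, output in v0; x2 points at the key
+## schedule, with the round count stored at x2+240.)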
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, 
%xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, 
{v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr x10, .Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adr x11, .Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // 
.Lk_inv, .Lk_ipt
+ adr x10, .Lk_dksd
+ ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
+ adr x11, .Lk_mc_forward
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
+ ld1 {v8.2d}, [x10] // .Lk_rcon
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
+ ret
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ add x8, x8, x10
+ cbnz w3, .Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+.Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi .Lschedule_256
+ b.eq .Lschedule_192
+ // 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+.Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+.Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+.Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
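+## (Per FIPS-197 this is the Nk=8 rule: schedule words at
+## i mod 8 == 4 get SubWord only, with no RotWord and no rcon.)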
+## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. 
+## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + cbz x2, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +.globl vpaes_ecb_encrypt +.type vpaes_ecb_encrypt,%function +.align 4 +vpaes_ecb_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! 
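+ // byte count arrives in x2 (moved to x17 below); an odd 16-byte block
+ // is handled first, then the loop does two blocks per pass via
+ // _vpaes_encrypt_2x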
+
+ mov x17, x2
+ mov x2, x3
+ bl _vpaes_encrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_enc_loop
+
+ ld1 {v7.16b}, [x0],#16
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16
+ subs x17, x17, #16
+ b.ls .Lecb_enc_done
+
+.align 4
+.Lecb_enc_loop:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_encrypt_2x
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lecb_enc_loop
+
+.Lecb_enc_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl vpaes_ecb_decrypt
+.type vpaes_ecb_decrypt,%function
+.align 4
+vpaes_ecb_decrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2
+ mov x2, x3
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_dec_loop
+
+ ld1 {v7.16b}, [x0],#16
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1],#16
+ subs x17, x17, #16
+ b.ls .Lecb_dec_done
+
+.align 4
+.Lecb_dec_loop:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lecb_dec_loop
+
+.Lecb_dec_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
diff --git a/lib/aes_acc/asm/x64.S b/lib/aes_acc/asm/x64.S
new file mode 100644
index 0000000..d193298
--- /dev/null
+++ b/lib/aes_acc/asm/x64.S
@@ -0,0 +1,827 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry

+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte
102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $0x30,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $0x30,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $0x30,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz 
.Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq 
$0x30,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +.globl vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_encrypt,.-vpaes_encrypt + +.globl vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_decrypt,.-vpaes_decrypt +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) +.Lcbc_abort: + .byte 0xf3,0xc3 +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 +.size _vpaes_preheat,.-_vpaes_preheat + + + + + +.type _vpaes_consts,@object +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 
0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts diff --git a/makefile b/makefile index 4574c8b..54a80f4 100755 --- a/makefile +++ b/makefile @@ -6,7 +6,7 @@ cc_arm=/home/wangyu/Desktop/arm-2014.05/bin/arm-none-linux-gnueabi-g++ FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers SOURCES=main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp -SOURCES_AES_ACC=main.cpp $(wildcard lib/aes_acc/aes*.c) lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp +SOURCES_AES_ACC=$(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c) NAME=udp2raw TAR=${NAME}_binaries.tar.gz ${NAME}_amd64 ${NAME}_x86 ${NAME}_x86_asm_aes ${NAME}_ar71xx ${NAME}_bcm2708 ${NAME}_arm ${NAME}_amd64_hw_aes ${NAME}_arm_asm_aes @@ -32,16 +32,16 @@ bcm2708: amd64: ${cc_local} -o ${NAME}_amd64 -I. ${SOURCES} ${FLAGS} -lrt -static -O3 amd64_hw_aes: - ${cc_local} -o ${NAME}_amd64_hw_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 + ${cc_local} -o ${NAME}_amd64_hw_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/x64.S x86: ${cc_local} -o ${NAME}_x86 -I. ${SOURCES} ${FLAGS} -lrt -static -O3 -m32 x86_asm_aes: - ${cc_local} -o ${NAME}_x86_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 -DHAVE_ASM lib/aes_acc/asm/x86.S + ${cc_local} -o ${NAME}_x86_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 lib/aes_acc/asm/x86.S arm: ${cc_cross} -o ${NAME}_arm -I. ${SOURCES} ${FLAGS} -lrt -static -O3 arm_asm_aes: - ${cc_cross} -o ${NAME}_arm_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -DHAVE_ASM lib/aes_acc/asm/arm.S + ${cc_cross} -o ${NAME}_arm_asm_aes -I. 
${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/arm.S cross: ${cc_cross} -o ${NAME}_cross -I. ${SOURCES} ${FLAGS} -lrt -O3
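
Note on the makefile hunk above: SOURCES_AES_ACC is now derived from SOURCES
instead of being a second hand-maintained list, so the two can no longer
drift apart when a source file is added or renamed. A minimal sketch of the
idiom, using the lists from this makefile (the wildcard expansion depends on
which lib/aes_acc/aes*.c files actually exist in the tree):

    SOURCES         = main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp
    # drop the portable C AES, pull in the accelerated front-ends
    SOURCES_AES_ACC = $(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c)
    # filter-out leaves: main.cpp lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp

The per-platform assembly (lib/aes_acc/asm/x64.S, x86.S, arm.S) is passed to
the compiler driver as an ordinary input on the matching amd64_hw_aes,
x86_asm_aes and arm_asm_aes targets, and the -DHAVE_ASM define is dropped
from those targets at the same time.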