From 8d1e735041ed8f5b5d83eead59786e96521ef79a Mon Sep 17 00:00:00 2001
From: Linus Yang
Date: Fri, 18 Aug 2017 21:16:49 +0800
Subject: [PATCH 1/2] Add mips big-endian asm

---
 lib/aes_acc/asm/mips.S    |   12 +-
 lib/aes_acc/asm/mips_be.S | 1831 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1835 insertions(+), 8 deletions(-)
 create mode 100644 lib/aes_acc/asm/mips_be.S

diff --git a/lib/aes_acc/asm/mips.S b/lib/aes_acc/asm/mips.S
index 4aa0e4c..97a12b9 100644
--- a/lib/aes_acc/asm/mips.S
+++ b/lib/aes_acc/asm/mips.S
@@ -507,6 +507,7 @@ AES_encrypt:
 	.frame	$29,64,$31
 	.mask	0xc0ff0000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,64
 	sw	$31,64-1*4($29)
 	sw	$30,64-2*4($29)
@@ -518,8 +519,6 @@ AES_encrypt:
 	sw	$18,64-8*4($29)
 	sw	$17,64-9*4($29)
 	sw	$16,64-10*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_encrypt
 	.set	reorder
 	la	$7,AES_Te		# PIC-ified 'load address'
 
@@ -1050,6 +1049,7 @@ AES_decrypt:
 	.frame	$29,64,$31
 	.mask	0xc0ff0000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,64
 	sw	$31,64-1*4($29)
 	sw	$30,64-2*4($29)
@@ -1061,8 +1061,6 @@ AES_decrypt:
 	sw	$18,64-8*4($29)
 	sw	$17,64-9*4($29)
 	sw	$16,64-10*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_decrypt
 	.set	reorder
 	la	$7,AES_Td		# PIC-ified 'load address'
 
@@ -1359,11 +1357,10 @@ AES_set_encrypt_key:
 	.frame	$29,32,$31
 	.mask	0xc0000000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,32
 	sw	$31,32-1*4($29)
 	sw	$30,32-2*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_set_encrypt_key
 	.set	reorder
 	la	$7,AES_Te4		# PIC-ified 'load address'
 
@@ -1383,11 +1380,10 @@ AES_set_decrypt_key:
 	.frame	$29,32,$31
 	.mask	0xc0000000,-4
 	.set	noreorder
+	.cpload	$25
 	sub	$29,32
 	sw	$31,32-1*4($29)
 	sw	$30,32-2*4($29)
-	.cplocal	$7
-	.cpsetup	$25,$0,AES_set_decrypt_key
 	.set	reorder
 	la	$7,AES_Te4		# PIC-ified 'load address'
 
diff --git a/lib/aes_acc/asm/mips_be.S b/lib/aes_acc/asm/mips_be.S
new file mode 100644
index 0000000..6fcab5a
--- /dev/null
+++ b/lib/aes_acc/asm/mips_be.S
@@ -0,0 +1,1831 @@
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
+#define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
+.option	pic2
+#endif
+.set	noat
+.align	5
+.ent	_mips_AES_encrypt
+_mips_AES_encrypt:
+	.frame	$29,0,$31
+	.set	reorder
+	lw	$12,0($6)
+	lw	$13,4($6)
+	lw	$14,8($6)
+	lw	$15,12($6)
+	lw	$30,240($6)
+	add	$3,$6,16
+
+	xor	$8,$12
+	xor	$9,$13
+	xor	$10,$14
+	xor	$11,$15
+
+	sub	$30,1
+#if defined(__mips_smartmips)
+	ext	$1,$9,16,8
+.Loop_enc:
+	ext	$2,$10,16,8
+	ext	$24,$11,16,8
+	ext	$25,$8,16,8
+	lwxs	$12,$1($7)		# Te1[s1>>16]
+	ext	$1,$10,8,8
+	lwxs	$13,$2($7)		# Te1[s2>>16]
+	ext	$2,$11,8,8
+	lwxs	$14,$24($7)		# Te1[s3>>16]
+	ext	$24,$8,8,8
+	lwxs	$15,$25($7)		# Te1[s0>>16]
+	ext	$25,$9,8,8
+
+	lwxs	$16,$1($7)		# Te2[s2>>8]
+	ext	$1,$11,0,8
+	lwxs	$17,$2($7)		# Te2[s3>>8]
+	ext	$2,$8,0,8
+	lwxs	$18,$24($7)		# Te2[s0>>8]
+	ext	$24,$9,0,8
+	lwxs	$19,$25($7)		# Te2[s1>>8]
+	ext	$25,$10,0,8
+
+	lwxs	$20,$1($7)		# Te3[s3]
+	ext	$1,$8,24,8
+	lwxs	$21,$2($7)		# Te3[s0]
+	ext	$2,$9,24,8
+	lwxs	$22,$24($7)		# Te3[s1]
+	ext	$24,$10,24,8
+	lwxs	$23,$25($7)		# Te3[s2]
+	ext	$25,$11,24,8
+
+	rotr	$12,$12,8
+	rotr	$13,$13,8
+	rotr	$14,$14,8
+	rotr	$15,$15,8
+
+	rotr	$16,$16,16
+	rotr	$17,$17,16
+	rotr	$18,$18,16
+	rotr	$19,$19,16
+
+	xor	$12,$16
+	lwxs	$16,$1($7)		# Te0[s0>>24]
+	xor	$13,$17
+	lwxs	$17,$2($7)		# Te0[s1>>24]
+	xor	$14,$18
+	lwxs	$18,$24($7)		# Te0[s2>>24]
+	xor	$15,$19
+	lwxs	$19,$25($7)		# Te0[s3>>24]
+
+	rotr	$20,$20,24
+	lw	$8,0($3)
+	rotr	$21,$21,24
+	lw	$9,4($3)
+	rotr	$22,$22,24
+	lw	$10,8($3)
+ rotr $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_enc + ext $1,$9,16,8 + + srl $1,$9,14 +#else + srl $1,$9,14 +.Loop_enc: + srl $2,$10,14 + srl $24,$11,14 + srl $25,$8,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $12,0($1) # Te1[s1>>16] + srl $1,$10,6 + lw $13,0($2) # Te1[s2>>16] + srl $2,$11,6 + lw $14,0($24) # Te1[s3>>16] + srl $24,$8,6 + lw $15,0($25) # Te1[s0>>16] + srl $25,$9,6 +#else + lwl $12,3($1) # Te1[s1>>16] + lwl $13,3($2) # Te1[s2>>16] + lwl $14,3($24) # Te1[s3>>16] + lwl $15,3($25) # Te1[s0>>16] + lwr $12,2($1) # Te1[s1>>16] + srl $1,$10,6 + lwr $13,2($2) # Te1[s2>>16] + srl $2,$11,6 + lwr $14,2($24) # Te1[s3>>16] + srl $24,$8,6 + lwr $15,2($25) # Te1[s0>>16] + srl $25,$9,6 +#endif + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 +# if defined(_MIPSEL) + lw $16,0($1) # Te2[s2>>8] + sll $1,$11,2 + lw $17,0($2) # Te2[s3>>8] + sll $2,$8,2 + lw $18,0($24) # Te2[s0>>8] + sll $24,$9,2 + lw $19,0($25) # Te2[s1>>8] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lw $20,0($1) # Te3[s3] + ins $1,$8,2,8 + lw $21,0($2) # Te3[s0] + ins $2,$9,2,8 + lw $22,0($24) # Te3[s1] + ins $24,$10,2,8 + lw $23,0($25) # Te3[s2] + ins $25,$11,2,8 +# else + lw $16,0($1) # Te2[s2>>8] + ins $1,$11,2,8 + lw $17,0($2) # Te2[s3>>8] + ins $2,$8,2,8 + lw $18,0($24) # Te2[s0>>8] + ins $24,$9,2,8 + lw $19,0($25) # Te2[s1>>8] + ins $25,$10,2,8 + + lw $20,0($1) # Te3[s3] + srl $1,$8,22 + lw $21,0($2) # Te3[s0] + srl $2,$9,22 + lw $22,0($24) # Te3[s1] + srl $24,$10,22 + lw $23,0($25) # Te3[s2] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# endif + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + rotr $20,$20,24 + rotr $21,$21,24 + rotr $22,$22,24 + rotr $23,$23,24 +#else + lwl $16,2($1) # Te2[s2>>8] + lwl $17,2($2) # Te2[s3>>8] + lwl $18,2($24) # Te2[s0>>8] + lwl $19,2($25) # Te2[s1>>8] + lwr $16,1($1) # Te2[s2>>8] + sll $1,$11,2 + lwr $17,1($2) # Te2[s3>>8] + sll $2,$8,2 + lwr $18,1($24) # Te2[s0>>8] + sll $24,$9,2 + lwr $19,1($25) # Te2[s1>>8] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,1($1) # Te3[s3] + lwl $21,1($2) # Te3[s0] + lwl $22,1($24) # Te3[s1] + lwl $23,1($25) # Te3[s2] + lwr $20,0($1) # Te3[s3] + srl $1,$8,22 + lwr $21,0($2) # Te3[s0] + srl $2,$9,22 + lwr $22,0($24) # Te3[s1] + srl $24,$10,22 + lwr $23,0($25) # Te3[s2] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + xor $12,$16 + lw $16,0($1) # Te0[s0>>24] + xor $13,$17 + lw $17,0($2) # Te0[s1>>24] + xor $14,$18 + lw $18,0($24) # Te0[s2>>24] + xor $15,$19 + lw $19,0($25) # Te0[s3>>24] + + xor $12,$20 + lw $8,0($3) + xor $13,$21 + lw $9,4($3) + xor $14,$22 + lw $10,8($3) + xor $15,$23 + lw $11,12($3) + + xor $12,$16 + xor $13,$17 + 
xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_enc + srl $1,$9,14 +#endif + + .set reorder + srl $2,$10,14 + srl $24,$11,14 + srl $25,$8,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,2($1) # Te4[s1>>16] + srl $1,$10,6 + lbu $13,2($2) # Te4[s2>>16] + srl $2,$11,6 + lbu $14,2($24) # Te4[s3>>16] + srl $24,$8,6 + lbu $15,2($25) # Te4[s0>>16] + srl $25,$9,6 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) + lbu $16,2($1) # Te4[s2>>8] + ins $1,$8,2,8 + lbu $17,2($2) # Te4[s3>>8] + ins $2,$9,2,8 + lbu $18,2($24) # Te4[s0>>8] + ins $24,$10,2,8 + lbu $19,2($25) # Te4[s1>>8] + ins $25,$11,2,8 + + lbu $20,2($1) # Te4[s0>>24] + sll $1,$11,2 + lbu $21,2($2) # Te4[s1>>24] + sll $2,$8,2 + lbu $22,2($24) # Te4[s2>>24] + sll $24,$9,2 + lbu $23,2($25) # Te4[s3>>24] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# else + lbu $16,2($1) # Te4[s2>>8] + srl $1,$8,22 + lbu $17,2($2) # Te4[s3>>8] + srl $2,$9,22 + lbu $18,2($24) # Te4[s0>>8] + srl $24,$10,22 + lbu $19,2($25) # Te4[s1>>8] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,2($1) # Te4[s0>>24] + ins $1,$11,2,8 + lbu $21,2($2) # Te4[s1>>24] + ins $2,$8,2,8 + lbu $22,2($24) # Te4[s2>>24] + ins $24,$9,2,8 + lbu $23,2($25) # Te4[s3>>24] + ins $25,$10,2,8 +# endif + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + ins $12,$16,8,8 + lbu $16,2($1) # Te4[s3] + ins $13,$17,8,8 + lbu $17,2($2) # Te4[s0] + ins $14,$18,8,8 + lbu $18,2($24) # Te4[s1] + ins $15,$19,8,8 + lbu $19,2($25) # Te4[s2] + + ins $12,$20,24,8 + lw $8,0($3) + ins $13,$21,24,8 + lw $9,4($3) + ins $14,$22,24,8 + lw $10,8($3) + ins $15,$23,24,8 + lw $11,12($3) + + ins $12,$16,0,8 + ins $13,$17,0,8 + ins $14,$18,0,8 + ins $15,$19,0,8 +#else + lbu $16,2($1) # Te4[s2>>8] + srl $1,$8,22 + lbu $17,2($2) # Te4[s3>>8] + srl $2,$9,22 + lbu $18,2($24) # Te4[s0>>8] + srl $24,$10,22 + lbu $19,2($25) # Te4[s1>>8] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,2($1) # Te4[s0>>24] + sll $1,$11,2 + lbu $21,2($2) # Te4[s1>>24] + sll $2,$8,2 + lbu $22,2($24) # Te4[s2>>24] + sll $24,$9,2 + lbu $23,2($25) # Te4[s3>>24] + sll $25,$10,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + sll $16,$16,8 + sll $17,$17,8 + sll $18,$18,8 + sll $19,$19,8 + + xor $12,$16 + lbu $16,2($1) # Te4[s3] + xor $13,$17 + lbu $17,2($2) # Te4[s0] + xor $14,$18 + lbu $18,2($24) # Te4[s1] + xor $15,$19 + lbu $19,2($25) # Te4[s2] + + sll $20,$20,24 + lw $8,0($3) + sll $21,$21,24 + lw $9,4($3) + sll $22,$22,24 + lw $10,8($3) + sll $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + #sll $16,$16,0 + #sll $17,$17,0 + #sll $18,$18,0 + #sll $19,$19,0 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 +#endif + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + 
.frame $29,64,$31 + .mask 0xc0ff0000,-4 + .set noreorder + .cpload $25 + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .set reorder + la $7,AES_Te # PIC-ified 'load address' + + lwl $8,0+0($4) + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + + bal _mips_AES_encrypt + + swr $8,0+3($5) + swr $9,4+3($5) + swr $10,8+3($5) + swr $11,12+3($5) + swl $8,0+0($5) + swl $9,4+0($5) + swl $10,8+0($5) + swl $11,12+0($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_encrypt +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $29,0,$31 + .set reorder + lw $12,0($6) + lw $13,4($6) + lw $14,8($6) + lw $15,12($6) + lw $30,240($6) + add $3,$6,16 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + sub $30,1 +#if defined(__mips_smartmips) + ext $1,$11,16,8 +.Loop_dec: + ext $2,$8,16,8 + ext $24,$9,16,8 + ext $25,$10,16,8 + lwxs $12,$1($7) # Td1[s3>>16] + ext $1,$10,8,8 + lwxs $13,$2($7) # Td1[s0>>16] + ext $2,$11,8,8 + lwxs $14,$24($7) # Td1[s1>>16] + ext $24,$8,8,8 + lwxs $15,$25($7) # Td1[s2>>16] + ext $25,$9,8,8 + + lwxs $16,$1($7) # Td2[s2>>8] + ext $1,$9,0,8 + lwxs $17,$2($7) # Td2[s3>>8] + ext $2,$10,0,8 + lwxs $18,$24($7) # Td2[s0>>8] + ext $24,$11,0,8 + lwxs $19,$25($7) # Td2[s1>>8] + ext $25,$8,0,8 + + lwxs $20,$1($7) # Td3[s1] + ext $1,$8,24,8 + lwxs $21,$2($7) # Td3[s2] + ext $2,$9,24,8 + lwxs $22,$24($7) # Td3[s3] + ext $24,$10,24,8 + lwxs $23,$25($7) # Td3[s0] + ext $25,$11,24,8 + + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 + + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + xor $12,$16 + lwxs $16,$1($7) # Td0[s0>>24] + xor $13,$17 + lwxs $17,$2($7) # Td0[s1>>24] + xor $14,$18 + lwxs $18,$24($7) # Td0[s2>>24] + xor $15,$19 + lwxs $19,$25($7) # Td0[s3>>24] + + rotr $20,$20,24 + lw $8,0($3) + rotr $21,$21,24 + lw $9,4($3) + rotr $22,$22,24 + lw $10,8($3) + rotr $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_dec + ext $1,$11,16,8 + + srl $1,$11,14 +#else + srl $1,$11,14 +.Loop_dec: + srl $2,$8,14 + srl $24,$9,14 + srl $25,$10,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + lw $12,0($1) # Td1[s3>>16] + srl $1,$10,6 + lw $13,0($2) # Td1[s0>>16] + srl $2,$11,6 + lw $14,0($24) # Td1[s1>>16] + srl $24,$8,6 + lw $15,0($25) # Td1[s2>>16] + srl $25,$9,6 +#else + lwl $12,3($1) # Td1[s3>>16] + lwl $13,3($2) # Td1[s0>>16] + lwl $14,3($24) # Td1[s1>>16] + lwl $15,3($25) # Td1[s2>>16] + lwr $12,2($1) # Td1[s3>>16] + srl $1,$10,6 + lwr $13,2($2) # Td1[s0>>16] + srl $2,$11,6 + lwr $14,2($24) # Td1[s1>>16] + srl $24,$8,6 + lwr $15,2($25) # Td1[s2>>16] + srl $25,$9,6 +#endif + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || 
defined(_MIPS_ARCH_MIPS64R2) + rotr $12,$12,8 + rotr $13,$13,8 + rotr $14,$14,8 + rotr $15,$15,8 +# if defined(_MIPSEL) + lw $16,0($1) # Td2[s2>>8] + sll $1,$9,2 + lw $17,0($2) # Td2[s3>>8] + sll $2,$10,2 + lw $18,0($24) # Td2[s0>>8] + sll $24,$11,2 + lw $19,0($25) # Td2[s1>>8] + sll $25,$8,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lw $20,0($1) # Td3[s1] + ins $1,$8,2,8 + lw $21,0($2) # Td3[s2] + ins $2,$9,2,8 + lw $22,0($24) # Td3[s3] + ins $24,$10,2,8 + lw $23,0($25) # Td3[s0] + ins $25,$11,2,8 +#else + lw $16,0($1) # Td2[s2>>8] + ins $1,$9,2,8 + lw $17,0($2) # Td2[s3>>8] + ins $2,$10,2,8 + lw $18,0($24) # Td2[s0>>8] + ins $24,$11,2,8 + lw $19,0($25) # Td2[s1>>8] + ins $25,$8,2,8 + + lw $20,0($1) # Td3[s1] + srl $1,$8,22 + lw $21,0($2) # Td3[s2] + srl $2,$9,22 + lw $22,0($24) # Td3[s3] + srl $24,$10,22 + lw $23,0($25) # Td3[s0] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + rotr $16,$16,16 + rotr $17,$17,16 + rotr $18,$18,16 + rotr $19,$19,16 + + rotr $20,$20,24 + rotr $21,$21,24 + rotr $22,$22,24 + rotr $23,$23,24 +#else + lwl $16,2($1) # Td2[s2>>8] + lwl $17,2($2) # Td2[s3>>8] + lwl $18,2($24) # Td2[s0>>8] + lwl $19,2($25) # Td2[s1>>8] + lwr $16,1($1) # Td2[s2>>8] + sll $1,$9,2 + lwr $17,1($2) # Td2[s3>>8] + sll $2,$10,2 + lwr $18,1($24) # Td2[s0>>8] + sll $24,$11,2 + lwr $19,1($25) # Td2[s1>>8] + sll $25,$8,2 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,1($1) # Td3[s1] + lwl $21,1($2) # Td3[s2] + lwl $22,1($24) # Td3[s3] + lwl $23,1($25) # Td3[s0] + lwr $20,0($1) # Td3[s1] + srl $1,$8,22 + lwr $21,0($2) # Td3[s2] + srl $2,$9,22 + lwr $22,0($24) # Td3[s3] + srl $24,$10,22 + lwr $23,0($25) # Td3[s0] + srl $25,$11,22 + + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#endif + + xor $12,$16 + lw $16,0($1) # Td0[s0>>24] + xor $13,$17 + lw $17,0($2) # Td0[s1>>24] + xor $14,$18 + lw $18,0($24) # Td0[s2>>24] + xor $15,$19 + lw $19,0($25) # Td0[s3>>24] + + xor $12,$20 + lw $8,0($3) + xor $13,$21 + lw $9,4($3) + xor $14,$22 + lw $10,8($3) + xor $15,$23 + lw $11,12($3) + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_dec + srl $1,$11,14 +#endif + + .set reorder + lw $16,1024($7) # prefetch Td4 + srl $1,$11,16 + lw $17,1024+32($7) + srl $2,$8,16 + lw $18,1024+64($7) + srl $24,$9,16 + lw $19,1024+96($7) + srl $25,$10,16 + lw $20,1024+128($7) + and $1,0xff + lw $21,1024+160($7) + and $2,0xff + lw $22,1024+192($7) + and $24,0xff + lw $23,1024+224($7) + and $25,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,1024($1) # Td4[s3>>16] + srl $1,$10,8 + lbu $13,1024($2) # Td4[s0>>16] + srl $2,$11,8 + lbu $14,1024($24) # Td4[s1>>16] + srl $24,$8,8 + lbu $15,1024($25) # Td4[s2>>16] + srl $25,$9,8 + + and $1,0xff + and $2,0xff + and $24,0xff + and $25,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) +# if defined(_MIPSEL) + lbu $16,1024($1) # Td4[s2>>8] + ins $1,$8,0,8 + lbu $17,1024($2) # Td4[s3>>8] + ins $2,$9,0,8 + lbu $18,1024($24) # Td4[s0>>8] + ins $24,$10,0,8 + lbu $19,1024($25) # Td4[s1>>8] + ins $25,$11,0,8 + + lbu $20,1024($1) # Td4[s0>>24] + and $1,$9,0xff + lbu $21,1024($2) # 
Td4[s1>>24] + and $2,$10,0xff + lbu $22,1024($24) # Td4[s2>>24] + and $24,$11,0xff + lbu $23,1024($25) # Td4[s3>>24] + and $25,$8,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 +# else + lbu $16,1024($1) # Td4[s2>>8] + srl $1,$8,24 + lbu $17,1024($2) # Td4[s3>>8] + srl $2,$9,24 + lbu $18,1024($24) # Td4[s0>>8] + srl $24,$10,24 + lbu $19,1024($25) # Td4[s1>>8] + srl $25,$11,24 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,1024($1) # Td4[s0>>24] + ins $1,$9,0,8 + lbu $21,1024($2) # Td4[s1>>24] + ins $2,$10,0,8 + lbu $22,1024($24) # Td4[s2>>24] + ins $24,$11,0,8 + lbu $23,1024($25) # Td4[s3>>24] + ins $25,$8,0,8 +# endif + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + ins $12,$16,8,8 + lbu $16,1024($1) # Td4[s1] + ins $13,$17,8,8 + lbu $17,1024($2) # Td4[s2] + ins $14,$18,8,8 + lbu $18,1024($24) # Td4[s3] + ins $15,$19,8,8 + lbu $19,1024($25) # Td4[s0] + + ins $12,$20,24,8 + lw $8,0($3) + ins $13,$21,24,8 + lw $9,4($3) + ins $14,$22,24,8 + lw $10,8($3) + ins $15,$23,24,8 + lw $11,12($3) + + ins $12,$16,0,8 + ins $13,$17,0,8 + ins $14,$18,0,8 + ins $15,$19,0,8 +#else + lbu $16,1024($1) # Td4[s2>>8] + srl $1,$8,24 + lbu $17,1024($2) # Td4[s3>>8] + srl $2,$9,24 + lbu $18,1024($24) # Td4[s0>>8] + srl $24,$10,24 + lbu $19,1024($25) # Td4[s1>>8] + srl $25,$11,24 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,1024($1) # Td4[s0>>24] + and $1,$9,0xff + lbu $21,1024($2) # Td4[s1>>24] + and $2,$10,0xff + lbu $22,1024($24) # Td4[s2>>24] + and $24,$11,0xff + lbu $23,1024($25) # Td4[s3>>24] + and $25,$8,0xff + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + + sll $12,$12,16 + sll $13,$13,16 + sll $14,$14,16 + sll $15,$15,16 + + sll $16,$16,8 + sll $17,$17,8 + sll $18,$18,8 + sll $19,$19,8 + + xor $12,$16 + lbu $16,1024($1) # Td4[s1] + xor $13,$17 + lbu $17,1024($2) # Td4[s2] + xor $14,$18 + lbu $18,1024($24) # Td4[s3] + xor $15,$19 + lbu $19,1024($25) # Td4[s0] + + sll $20,$20,24 + lw $8,0($3) + sll $21,$21,24 + lw $9,4($3) + sll $22,$22,24 + lw $10,8($3) + sll $23,$23,24 + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + #sll $16,$16,0 + #sll $17,$17,0 + #sll $18,$18,0 + #sll $19,$19,0 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 +#endif + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $29,64,$31 + .mask 0xc0ff0000,-4 + .set noreorder + .cpload $25 + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .set reorder + la $7,AES_Td # PIC-ified 'load address' + + lwl $8,0+0($4) + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + + bal _mips_AES_decrypt + + swr $8,0+3($5) + swr $9,4+3($5) + swr $10,8+3($5) + swr $11,12+3($5) + swl $8,0+0($5) + swl $9,4+0($5) + swl $10,8+0($5) + swl $11,12+0($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_decrypt +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $29,0,$31 + .set noreorder + beqz $4,.Lekey_done + li $2,-1 + beqz $6,.Lekey_done + add $3,$7,256 + + .set 
reorder + lwl $8,0+0($4) # load 128 bits + lwl $9,4+0($4) + lwl $10,8+0($4) + lwl $11,12+0($4) + li $1,128 + lwr $8,0+3($4) + lwr $9,4+3($4) + lwr $10,8+3($4) + lwr $11,12+3($4) + .set noreorder + beq $5,$1,.L128bits + li $30,10 + + .set reorder + lwl $12,16+0($4) # load 192 bits + lwl $13,20+0($4) + li $1,192 + lwr $12,16+3($4) + lwr $13,20+3($4) + .set noreorder + beq $5,$1,.L192bits + li $30,8 + + .set reorder + lwl $14,24+0($4) # load 256 bits + lwl $15,28+0($4) + li $1,256 + lwr $14,24+3($4) + lwr $15,28+3($4) + .set noreorder + beq $5,$1,.L256bits + li $30,7 + + b .Lekey_done + li $2,-2 + +.align 4 +.L128bits: + .set reorder + srl $1,$11,16 + srl $2,$11,8 + and $1,0xff + and $2,0xff + and $24,$11,0xff + srl $25,$11,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sub $30,1 + add $6,16 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + + .set noreorder + bnez $30,.L128bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,10 + sw $11,12($6) + li $2,0 + sw $30,80($6) + b .Lekey_done + sub $6,10*16 + +.align 4 +.L192bits: + .set reorder + srl $1,$13,16 + srl $2,$13,8 + and $1,0xff + and $2,0xff + and $24,$13,0xff + srl $25,$13,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sub $30,1 + add $6,24 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + xor $12,$11 + xor $13,$12 + + .set noreorder + bnez $30,.L192bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,12 + sw $11,12($6) + li $2,0 + sw $30,48($6) + b .Lekey_done + sub $6,12*16 + +.align 4 +.L256bits: + .set reorder + srl $1,$15,16 + srl $2,$15,8 + and $1,0xff + and $2,0xff + and $24,$15,0xff + srl $25,$15,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sw $14,24($6) + sw $15,28($6) + sub $30,1 + + sll $1,$1,24 + sll $2,$2,16 + sll $24,$24,8 + #sll $25,$25,0 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + beqz $30,.L256bits_done + + srl $1,$11,24 + srl $2,$11,16 + srl $24,$11,8 + and $25,$11,0xff + and $2,0xff + and $24,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,0($1) + lbu $2,0($2) + lbu $24,0($24) + lbu $25,0($25) + sll $1,24 + sll $2,16 + sll $24,8 + + xor $12,$1 + xor $12,$2 + xor $12,$24 + xor $12,$25 + + xor $13,$12 + xor $14,$13 + xor $15,$14 + + add $6,32 + .set noreorder + b .L256bits + add $3,4 + +.L256bits_done: + sw $8,32($6) + sw $9,36($6) + sw $10,40($6) + li $30,14 + sw $11,44($6) + li $2,0 + sw $30,48($6) + sub $6,12*16 + +.Lekey_done: + jr $31 + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $29,32,$31 + .mask 0xc0000000,-4 + .set noreorder + .cpload $25 + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .set reorder + la $7,AES_Te4 # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + 
.set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_encrypt_key +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $29,32,$31 + .mask 0xc0000000,-4 + .set noreorder + .cpload $25 + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .set reorder + la $7,AES_Te4 # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $2,.Ldkey_done + + sll $1,$30,4 + add $4,$6,0 + add $5,$6,$1 +.align 4 +.Lswap: + lw $8,0($4) + lw $9,4($4) + lw $10,8($4) + lw $11,12($4) + lw $12,0($5) + lw $13,4($5) + lw $14,8($5) + lw $15,12($5) + sw $8,0($5) + sw $9,4($5) + sw $10,8($5) + sw $11,12($5) + add $4,16 + sub $5,16 + sw $12,-16($4) + sw $13,-12($4) + sw $14,-8($4) + sw $15,-4($4) + bne $4,$5,.Lswap + + lw $8,16($6) # modulo-scheduled + lui $2,0x8080 + sub $30,1 + or $2,0x8080 + sll $30,2 + add $6,16 + lui $25,0x1b1b + nor $24,$0,$2 + or $25,0x1b1b +.align 4 +.Lmix: + and $1,$8,$2 + and $9,$8,$24 + srl $10,$1,7 + addu $9,$9 # tp2<<1 + subu $1,$10 + and $1,$25 + xor $9,$1 + + and $1,$9,$2 + and $10,$9,$24 + srl $11,$1,7 + addu $10,$10 # tp4<<1 + subu $1,$11 + and $1,$25 + xor $10,$1 + + and $1,$10,$2 + and $11,$10,$24 + srl $12,$1,7 + addu $11,$11 # tp8<<1 + subu $1,$12 + and $1,$25 + xor $11,$1 + + xor $12,$11,$8 + xor $15,$11,$10 + xor $13,$12,$9 + xor $14,$12,$10 + +#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) + rotr $8,$14,16 + xor $15,$9 + rotr $9,$12,8 + xor $15,$8 + rotr $10,$13,24 + xor $15,$9 + lw $8,4($6) # modulo-scheduled + xor $15,$10 +#else + srl $8,$14,16 + xor $15,$9 + sll $9,$14,16 + xor $15,$8 + srl $8,$12,8 + xor $15,$9 + sll $9,$12,24 + xor $15,$8 + srl $8,$13,24 + xor $15,$9 + sll $9,$13,8 + xor $15,$8 + lw $8,4($6) # modulo-scheduled + xor $15,$9 +#endif + sub $30,1 + sw $15,0($6) + add $6,4 + bnez $30,.Lmix + + li $2,0 +.Ldkey_done: + .set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_decrypt_key +.rdata +.align 10 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 
0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 
0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 
0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 
0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + +AES_Te4: +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 
0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00	# rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00

From 4d797fb55dda1eb6e9874905bafe36ac674d75e9 Mon Sep 17 00:00:00 2001
From: Linus Yang
Date: Fri, 18 Aug 2017 22:27:03 +0800
Subject: [PATCH 2/2] Only use assembly code

---
 lib/aes_acc/aes0.c      |  600 --------------------
 lib/aes_acc/aes0.h      |   45 --
 lib/aes_acc/aesacc.c    |  197 +++----
 lib/aes_acc/aesacc.h    |   20 -
 lib/aes_acc/aesni.c     |   13 +-
 lib/aes_acc/aesni.h     |   12 +-
 lib/aes_acc/asm/arm64.S | 1178 +++++++++++++++++++++++++++++++++++++++
 lib/aes_acc/asm/x64.S   |  827 +++++++++++++++++++++++++++
 makefile                |    8 +-
 9 files changed, 2108 insertions(+), 792 deletions(-)
 delete mode 100755 lib/aes_acc/aes0.c
 delete mode 100755 lib/aes_acc/aes0.h
 delete mode 100644 lib/aes_acc/aesacc.h
 create mode 100644 lib/aes_acc/asm/arm64.S
 create mode 100644 lib/aes_acc/asm/x64.S

diff --git a/lib/aes_acc/aes0.c b/lib/aes_acc/aes0.c
deleted file mode 100755
index 2fdf02b..0000000
--- a/lib/aes_acc/aes0.c
+++ /dev/null
@@ -1,600 +0,0 @@
-
-/*
- * this file comes from https://github.com/kokke/tiny-AES128-C
- */
-
-/*
-
-This is an implementation of the AES algorithm, specifically ECB and CBC mode.
-Block size can be chosen in aes.h - available choices are AES128, AES192, AES256.
-
-The implementation is verified against the test vectors in:
-  National Institute of Standards and Technology Special Publication 800-38A 2001 ED
-
-ECB-AES128
-----------
-
-  plain-text:
-    6bc1bee22e409f96e93d7e117393172a
-    ae2d8a571e03ac9c9eb76fac45af8e51
-    30c81c46a35ce411e5fbc1191a0a52ef
-    f69f2445df4f9b17ad2b417be66c3710
-
-  key:
-    2b7e151628aed2a6abf7158809cf4f3c
-
-  resulting cipher
-    3ad77bb40d7a3660a89ecaf32466ef97
-    f5d3d58503b9699de785895a96fdbaaf
-    43b1cd7f598ece23881b00e3ed030688
-    7b0c785e27e8ad3f8223207104725dd4
-
-
-NOTE:   String length must be evenly divisible by 16byte (str_len % 16 == 0)
-        You should pad the end of the string with zeros if this is not the case.
-        For AES192/256 the block size is proportionally larger.
-
-*/
-
-
-/*****************************************************************************/
-/* Includes:                                                                 */
-/*****************************************************************************/
-#include <stdint.h>
-#include <string.h> // CBC mode, for memset
-#include "aes0.h"
-
-/*****************************************************************************/
-/* Defines:                                                                  */
-/*****************************************************************************/
-// The number of columns comprising a state in AES. This is a constant in AES. Value=4
-#define Nb 4
-#define BLOCKLEN 16 //Block length in bytes AES is 128b block only
-
-#if defined(AES256) && (AES256 == 1)
-    #define Nk 8
-    #define KEYLEN 32
-    #define Nr 14
-    #define keyExpSize 240
-#elif defined(AES192) && (AES192 == 1)
-    #define Nk 6
-    #define KEYLEN 24
-    #define Nr 12
-    #define keyExpSize 208
-#else
-    #define Nk 4        // The number of 32 bit words in a key.
-    #define KEYLEN 16   // Key length in bytes
-    #define Nr 10       // The number of rounds in AES Cipher.
-    #define keyExpSize 176
-#endif
-
-// jcallan@github points out that declaring Multiply as a function
-// reduces code size considerably with the Keil ARM compiler.
-// See this link for more information: https://github.com/kokke/tiny-AES128-C/pull/3 -#ifndef MULTIPLY_AS_A_FUNCTION - #define MULTIPLY_AS_A_FUNCTION 0 -#endif - - -/*****************************************************************************/ -/* Private variables: */ -/*****************************************************************************/ -// state - array holding the intermediate results during decryption. -typedef uint8_t state_t[4][4]; -static state_t* state; - -// The array that stores the round keys. -static uint8_t RoundKey[keyExpSize]; - -// The Key input to the AES Program -static const uint8_t* Key; - -#if defined(CBC) && CBC - // Initial Vector used only for CBC mode - static uint8_t* Iv; -#endif - -// The lookup-tables are marked const so they can be placed in read-only storage instead of RAM -// The numbers below can be computed dynamically trading ROM for RAM - -// This can be useful in (embedded) bootloader applications, where ROM is often limited. -static const uint8_t sbox[256] = { - //0 1 2 3 4 5 6 7 8 9 A B C D E F - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; - -static const uint8_t rsbox[256] = { - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, - 
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }; - -// The round constant word array, Rcon[i], contains the values given by -// x to th e power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8) -static const uint8_t Rcon[11] = { - 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; - -/* - * Jordan Goulder points out in PR #12 (https://github.com/kokke/tiny-AES128-C/pull/12), - * that you can remove most of the elements in the Rcon array, because they are unused. - * - * From Wikipedia's article on the Rijndael key schedule @ https://en.wikipedia.org/wiki/Rijndael_key_schedule#Rcon - * - * "Only the first some of these constants are actually used – up to rcon[10] for AES-128 (as 11 round keys are needed), - * up to rcon[8] for AES-192, up to rcon[7] for AES-256. rcon[0] is not used in AES algorithm." - * - * ... which is why the full array below has been 'disabled' below. - */ -#if 0 -static const uint8_t Rcon[256] = { - 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, - 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, - 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, - 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, - 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, - 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, - 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, - 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, - 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, - 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, - 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, - 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, - 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, - 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, - 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, - 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d }; -#endif - -/*****************************************************************************/ -/* Private functions: */ -/*****************************************************************************/ -static uint8_t getSBoxValue(uint8_t num) -{ - return sbox[num]; -} - -static uint8_t getSBoxInvert(uint8_t num) -{ - return rsbox[num]; -} - -// This function produces Nb(Nr+1) 
round keys. The round keys are used in each round to decrypt the states. -static void KeyExpansion(void) -{ - uint32_t i, k; - uint8_t tempa[4]; // Used for the column/row operations - - // The first round key is the key itself. - for (i = 0; i < Nk; ++i) - { - RoundKey[(i * 4) + 0] = Key[(i * 4) + 0]; - RoundKey[(i * 4) + 1] = Key[(i * 4) + 1]; - RoundKey[(i * 4) + 2] = Key[(i * 4) + 2]; - RoundKey[(i * 4) + 3] = Key[(i * 4) + 3]; - } - - // All other round keys are found from the previous round keys. - //i == Nk - for (; i < Nb * (Nr + 1); ++i) - { - { - tempa[0]=RoundKey[(i-1) * 4 + 0]; - tempa[1]=RoundKey[(i-1) * 4 + 1]; - tempa[2]=RoundKey[(i-1) * 4 + 2]; - tempa[3]=RoundKey[(i-1) * 4 + 3]; - } - - if (i % Nk == 0) - { - // This function shifts the 4 bytes in a word to the left once. - // [a0,a1,a2,a3] becomes [a1,a2,a3,a0] - - // Function RotWord() - { - k = tempa[0]; - tempa[0] = tempa[1]; - tempa[1] = tempa[2]; - tempa[2] = tempa[3]; - tempa[3] = k; - } - - // SubWord() is a function that takes a four-byte input word and - // applies the S-box to each of the four bytes to produce an output word. - - // Function Subword() - { - tempa[0] = getSBoxValue(tempa[0]); - tempa[1] = getSBoxValue(tempa[1]); - tempa[2] = getSBoxValue(tempa[2]); - tempa[3] = getSBoxValue(tempa[3]); - } - - tempa[0] = tempa[0] ^ Rcon[i/Nk]; - } -#if defined(AES256) && (AES256 == 1) - if (i % Nk == 4) - { - // Function Subword() - { - tempa[0] = getSBoxValue(tempa[0]); - tempa[1] = getSBoxValue(tempa[1]); - tempa[2] = getSBoxValue(tempa[2]); - tempa[3] = getSBoxValue(tempa[3]); - } - } -#endif - RoundKey[i * 4 + 0] = RoundKey[(i - Nk) * 4 + 0] ^ tempa[0]; - RoundKey[i * 4 + 1] = RoundKey[(i - Nk) * 4 + 1] ^ tempa[1]; - RoundKey[i * 4 + 2] = RoundKey[(i - Nk) * 4 + 2] ^ tempa[2]; - RoundKey[i * 4 + 3] = RoundKey[(i - Nk) * 4 + 3] ^ tempa[3]; - } -} - -// This function adds the round key to state. -// The round key is added to the state by an XOR function. -static void AddRoundKey(uint8_t round) -{ - uint8_t i,j; - for (i=0;i<4;++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[i][j] ^= RoundKey[round * Nb * 4 + i * Nb + j]; - } - } -} - -// The SubBytes Function Substitutes the values in the -// state matrix with values in an S-box. -static void SubBytes(void) -{ - uint8_t i, j; - for (i = 0; i < 4; ++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[j][i] = getSBoxValue((*state)[j][i]); - } - } -} - -// The ShiftRows() function shifts the rows in the state to the left. -// Each row is shifted with different offset. -// Offset = Row number. So the first row is not shifted. 
-static void ShiftRows(void) -{ - uint8_t temp; - - // Rotate first row 1 columns to left - temp = (*state)[0][1]; - (*state)[0][1] = (*state)[1][1]; - (*state)[1][1] = (*state)[2][1]; - (*state)[2][1] = (*state)[3][1]; - (*state)[3][1] = temp; - - // Rotate second row 2 columns to left - temp = (*state)[0][2]; - (*state)[0][2] = (*state)[2][2]; - (*state)[2][2] = temp; - - temp = (*state)[1][2]; - (*state)[1][2] = (*state)[3][2]; - (*state)[3][2] = temp; - - // Rotate third row 3 columns to left - temp = (*state)[0][3]; - (*state)[0][3] = (*state)[3][3]; - (*state)[3][3] = (*state)[2][3]; - (*state)[2][3] = (*state)[1][3]; - (*state)[1][3] = temp; -} - -static uint8_t xtime(uint8_t x) -{ - return ((x<<1) ^ (((x>>7) & 1) * 0x1b)); -} - -// MixColumns function mixes the columns of the state matrix -static void MixColumns(void) -{ - uint8_t i; - uint8_t Tmp,Tm,t; - for (i = 0; i < 4; ++i) - { - t = (*state)[i][0]; - Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3] ; - Tm = (*state)[i][0] ^ (*state)[i][1] ; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp ; - Tm = (*state)[i][1] ^ (*state)[i][2] ; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp ; - Tm = (*state)[i][2] ^ (*state)[i][3] ; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp ; - Tm = (*state)[i][3] ^ t ; Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp ; - } -} - -// Multiply is used to multiply numbers in the field GF(2^8) -#if MULTIPLY_AS_A_FUNCTION -static uint8_t Multiply(uint8_t x, uint8_t y) -{ - return (((y & 1) * x) ^ - ((y>>1 & 1) * xtime(x)) ^ - ((y>>2 & 1) * xtime(xtime(x))) ^ - ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ - ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))); - } -#else -#define Multiply(x, y) \ - ( ((y & 1) * x) ^ \ - ((y>>1 & 1) * xtime(x)) ^ \ - ((y>>2 & 1) * xtime(xtime(x))) ^ \ - ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ \ - ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))) \ - -#endif - -// MixColumns function mixes the columns of the state matrix. -// The method used to multiply may be difficult to understand for the inexperienced. -// Please use the references to gain more information. -static void InvMixColumns(void) -{ - int i; - uint8_t a, b, c, d; - for (i = 0; i < 4; ++i) - { - a = (*state)[i][0]; - b = (*state)[i][1]; - c = (*state)[i][2]; - d = (*state)[i][3]; - - (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09); - (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d); - (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b); - (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e); - } -} - - -// The SubBytes Function Substitutes the values in the -// state matrix with values in an S-box. 
-static void InvSubBytes(void) -{ - uint8_t i,j; - for (i = 0; i < 4; ++i) - { - for (j = 0; j < 4; ++j) - { - (*state)[j][i] = getSBoxInvert((*state)[j][i]); - } - } -} - -static void InvShiftRows(void) -{ - uint8_t temp; - - // Rotate first row 1 columns to right - temp = (*state)[3][1]; - (*state)[3][1] = (*state)[2][1]; - (*state)[2][1] = (*state)[1][1]; - (*state)[1][1] = (*state)[0][1]; - (*state)[0][1] = temp; - - // Rotate second row 2 columns to right - temp = (*state)[0][2]; - (*state)[0][2] = (*state)[2][2]; - (*state)[2][2] = temp; - - temp = (*state)[1][2]; - (*state)[1][2] = (*state)[3][2]; - (*state)[3][2] = temp; - - // Rotate third row 3 columns to right - temp = (*state)[0][3]; - (*state)[0][3] = (*state)[1][3]; - (*state)[1][3] = (*state)[2][3]; - (*state)[2][3] = (*state)[3][3]; - (*state)[3][3] = temp; -} - - -// Cipher is the main function that encrypts the PlainText. -static void Cipher(void) -{ - uint8_t round = 0; - - // Add the First round key to the state before starting the rounds. - AddRoundKey(0); - - // There will be Nr rounds. - // The first Nr-1 rounds are identical. - // These Nr-1 rounds are executed in the loop below. - for (round = 1; round < Nr; ++round) - { - SubBytes(); - ShiftRows(); - MixColumns(); - AddRoundKey(round); - } - - // The last round is given below. - // The MixColumns function is not here in the last round. - SubBytes(); - ShiftRows(); - AddRoundKey(Nr); -} - -static void InvCipher(void) -{ - uint8_t round=0; - - // Add the First round key to the state before starting the rounds. - AddRoundKey(Nr); - - // There will be Nr rounds. - // The first Nr-1 rounds are identical. - // These Nr-1 rounds are executed in the loop below. - for (round = (Nr - 1); round > 0; --round) - { - InvShiftRows(); - InvSubBytes(); - AddRoundKey(round); - InvMixColumns(); - } - - // The last round is given below. - // The MixColumns function is not here in the last round. - InvShiftRows(); - InvSubBytes(); - AddRoundKey(0); -} - - -/*****************************************************************************/ -/* Public functions: */ -/*****************************************************************************/ -#if defined(ECB) && (ECB == 1) - - -void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length) -{ - // Copy input to output, and work in-memory on output - memcpy(output, input, length); - state = (state_t*)output; - - Key = key; - KeyExpansion(); - - // The next function call encrypts the PlainText with the Key using AES algorithm. - Cipher(); -} - -void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length) -{ - // Copy input to output, and work in-memory on output - memcpy(output, input, length); - state = (state_t*)output; - - // The KeyExpansion routine must be called before encryption. - Key = key; - KeyExpansion(); - - InvCipher(); -} - - -#endif // #if defined(ECB) && (ECB == 1) - - - - - -#if defined(CBC) && (CBC == 1) - - -static void XorWithIv(uint8_t* buf) -{ - uint8_t i; - for (i = 0; i < BLOCKLEN; ++i) //WAS for(i = 0; i < KEYLEN; ++i) but the block in AES is always 128bit so 16 bytes! 
- { - buf[i] ^= Iv[i]; - } -} - -void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) -{ - uintptr_t i; - uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */ - - // Skip the key expansion if key is passed as 0 - if (0 != key) - { - Key = key; - KeyExpansion(); - } - - if (iv != 0) - { - Iv = (uint8_t*)iv; - } - - for (i = 0; i < length; i += BLOCKLEN) - { - XorWithIv(input); - memcpy(output, input, BLOCKLEN); - state = (state_t*)output; - Cipher(); - Iv = output; - input += BLOCKLEN; - output += BLOCKLEN; - //printf("Step %d - %d", i/16, i); - } - - if (extra) - { - memcpy(output, input, extra); - state = (state_t*)output; - Cipher(); - } -} - -void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) -{ - uintptr_t i; - uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */ - - // Skip the key expansion if key is passed as 0 - if (0 != key) - { - Key = key; - KeyExpansion(); - } - - // If iv is passed as 0, we continue to encrypt without re-setting the Iv - if (iv != 0) - { - Iv = (uint8_t*)iv; - } - - for (i = 0; i < length; i += BLOCKLEN) - { - memcpy(output, input, BLOCKLEN); - state = (state_t*)output; - InvCipher(); - XorWithIv(output); - Iv = input; - input += BLOCKLEN; - output += BLOCKLEN; - } - - if (extra) - { - memcpy(output, input, extra); - state = (state_t*)output; - InvCipher(); - } -} - -#endif // #if defined(CBC) && (CBC == 1) diff --git a/lib/aes_acc/aes0.h b/lib/aes_acc/aes0.h deleted file mode 100755 index cc693fc..0000000 --- a/lib/aes_acc/aes0.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * this file comes from https://github.com/kokke/tiny-AES128-C - */ - -#ifndef _AES_H_ -#define _AES_H_ - -#include - - -// #define the macros below to 1/0 to enable/disable the mode of operation. -// -// CBC enables AES encryption in CBC-mode of operation. -// ECB enables the basic ECB 16-byte block algorithm. Both can be enabled simultaneously. - -// The #ifndef-guard allows it to be configured before #include'ing or at compile time. -#ifndef CBC - #define CBC 1 -#endif - -#ifndef ECB - #define ECB 1 -#endif - -#define AES128 1 -//#define AES192 1 -//#define AES256 1 - -#if defined(ECB) && (ECB == 1) - -void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); - -#endif // #if defined(ECB) && (ECB == !) 
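For reference, a minimal round-trip through the removed tiny-AES CBC API above (standalone sketch, not part of the patch; the all-zero key and IV are placeholders). It illustrates the chaining C_i = E_K(P_i ^ C_{i-1}); note that AES_CBC_encrypt_buffer0() XORs the IV into its input buffer in place, so a pristine copy is kept for the comparison:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "aes0.h"   /* the header being removed by this patch */

    int main(void)
    {
        const uint8_t key[16] = {0};   /* placeholder key */
        const uint8_t iv[16]  = {0};   /* placeholder IV  */
        uint8_t plain[16], ref[16], cipher[16], out[16];

        memcpy(plain, "0123456789abcdef", 16);
        memcpy(ref, plain, 16);        /* encryption clobbers its input */

        AES_CBC_encrypt_buffer0(cipher, plain, 16, key, iv); /* C1 = E_K(P1 ^ IV) */
        AES_CBC_decrypt_buffer0(out, cipher, 16, key, iv);   /* P1 = D_K(C1) ^ IV */
        assert(memcmp(out, ref, 16) == 0);
        return 0;
    }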
- - -#if defined(CBC) && (CBC == 1) - -void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); -void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); - -#endif // #if defined(CBC) && (CBC == 1) - - -#endif //_AES_H_ diff --git a/lib/aes_acc/aesacc.c b/lib/aes_acc/aesacc.c index 73c76ab..2d11896 100644 --- a/lib/aes_acc/aesacc.c +++ b/lib/aes_acc/aesacc.c @@ -2,27 +2,25 @@ * This file is adapted from PolarSSL 1.3.19 (GPL) */ -#include "aes0.h" #include "aesni.h" #include "aesarm.h" -#include "aesacc.h" - +#include #include #if defined(AES256) && (AES256 == 1) #define AES_KEYSIZE 256 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_256 + #define aeshw_setkey_enc aesni_setkey_enc_256 #endif #elif defined(AES192) && (AES192 == 1) #define AES_KEYSIZE 192 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_192 + #define aeshw_setkey_enc aesni_setkey_enc_192 #endif #else #define AES_KEYSIZE 128 #ifdef HAVE_AMD64 - #define aes_setkey_enc aesni_setkey_enc_128 + #define aeshw_setkey_enc aesni_setkey_enc_128 #endif #endif @@ -31,15 +29,15 @@ #ifdef HAVE_AMD64 #define HAVE_HARDAES 1 -#define aes_supported aesni_supported -#define aes_crypt_ecb aesni_crypt_ecb -#define aes_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR) +#define aeshw_supported aesni_supported +#define aeshw_crypt_ecb aesni_crypt_ecb +#define aeshw_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR) #endif /* HAVE_AMD64 */ #ifdef HAVE_ARM64 #define HAVE_HARDAES 1 -#define aes_supported aesarm_supported -#define aes_crypt_ecb aesarm_crypt_ecb +#define aeshw_supported aesarm_supported +#define aeshw_crypt_ecb aesarm_crypt_ecb #include "aesarm_table.h" @@ -53,7 +51,7 @@ } #endif -static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) +static void aeshw_setkey_enc(uint8_t *rk, const uint8_t *key) { unsigned int i; uint32_t *RK; @@ -129,7 +127,7 @@ static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) } } -static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) +static void aeshw_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) { int i, j; uint32_t *RK; @@ -159,18 +157,32 @@ static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey) *RK++ = *SK++; *RK++ = *SK++; } - #endif /* HAVE_ARM64 */ -#ifdef HAVE_ASM +#ifdef HAVE_HARDAES +static void aeshw_setkey_dec(uint8_t *rk, const uint8_t *key) +{ + uint8_t rk_tmp[AES_RKSIZE]; + aeshw_setkey_enc(rk_tmp, key); + aeshw_inverse_key(rk, rk_tmp); +} +#endif /* HAVE_HARDAES */ +/* OpenSSL assembly functions */ #define AES_MAXNR 14 - typedef struct { uint32_t rd_key[4 * (AES_MAXNR + 1)]; - int rounds; + uint32_t rounds; } AES_KEY; +#if defined(__amd64__) || defined(__x86_64__) || \ + defined(__aarch64__) +#define AES_set_encrypt_key vpaes_set_encrypt_key +#define AES_set_decrypt_key vpaes_set_decrypt_key +#define AES_encrypt vpaes_encrypt +#define AES_decrypt vpaes_decrypt +#endif /* VPAES for 64-bit Intel and ARM */ + #ifdef __cplusplus extern "C" { #endif @@ -189,69 +201,51 @@ void AES_decrypt(const unsigned char *in, unsigned char *out, } #endif -static int aes_supported(void) -{ - return 2; -} - static void aes_crypt_ecb( int nr, unsigned char *rk, int mode, const unsigned char input[16], unsigned char output[16] ) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = nr; if (mode == AES_DECRYPT) { - AES_decrypt(input, output, ctx); + AES_decrypt(input, output, (AES_KEY *) rk); } else { - AES_encrypt(input, output, ctx); + 
AES_encrypt(input, output, (AES_KEY *) rk); } } static void aes_setkey_enc(uint8_t *rk, const uint8_t *key) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = AES_NR; - AES_set_encrypt_key(key, AES_KEYSIZE, ctx); + AES_set_encrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk); } static void aes_setkey_dec(uint8_t *rk, const uint8_t *key) { - AES_KEY *ctx; - ctx = (AES_KEY *) rk; - ctx->rounds = AES_NR; - AES_set_decrypt_key(key, AES_KEYSIZE, ctx); + AES_set_decrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk); } -#endif +static void (*crypt_ecb) ( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ) + = aes_crypt_ecb; -#ifdef HAVE_HARDAES +static void (*setkey_enc) (uint8_t *rk, const uint8_t *key) + = aes_setkey_enc; -static void aes_setkey_dec(uint8_t *rk, const uint8_t *key) -{ - uint8_t rk_tmp[AES_RKSIZE]; - aes_setkey_enc(rk_tmp, key); - aes_inverse_key(rk, rk_tmp); -} - -#endif - -#if defined(HAVE_HARDAES) || defined(HAVE_ASM) - -#define HAVE_ACC 1 +static void (*setkey_dec) (uint8_t *rk, const uint8_t *key) + = aes_setkey_dec; /* * AESNI-CBC buffer encryption/decryption */ -static void aes_crypt_cbc( int mode, - uint8_t* rk, - uint32_t length, - uint8_t iv[16], - const uint8_t *input, - uint8_t *output ) +static void crypt_cbc( int mode, + uint8_t* rk, + uint32_t length, + uint8_t iv[16], + const uint8_t *input, + uint8_t *output ) { int i; uint8_t temp[16]; @@ -261,7 +255,7 @@ static void aes_crypt_cbc( int mode, while( length > 0 ) { memcpy( temp, input, 16 ); - aes_crypt_ecb( AES_NR, rk, mode, input, output ); + crypt_ecb( AES_NR, rk, mode, input, output ); for( i = 0; i < 16; i++ ) output[i] = (uint8_t)( output[i] ^ iv[i] ); @@ -280,7 +274,7 @@ static void aes_crypt_cbc( int mode, for( i = 0; i < 16; i++ ) output[i] = (uint8_t)( input[i] ^ iv[i] ); - aes_crypt_ecb( AES_NR, rk, mode, output, output ); + crypt_ecb( AES_NR, rk, mode, output, output ); memcpy( iv, output, 16 ); input += 16; @@ -290,12 +284,26 @@ static void aes_crypt_cbc( int mode, } } -#endif /* HAVE_HARDAES or HAVE_ASM */ -int AESACC_supported(void) +static void aeshw_init(void) { -#if defined(HAVE_ACC) - return aes_supported(); +#ifdef HAVE_HARDAES + static int done = 0; + if (!done) { + if (aeshw_supported()) { + crypt_ecb = aeshw_crypt_ecb; + setkey_enc = aeshw_setkey_enc; + setkey_dec = aeshw_setkey_dec; + } + done = 1; + } +#endif +} + +int AES_support_hwaccel(void) +{ +#ifdef HAVE_HARDAES + return aeshw_supported(); #else return 0; #endif @@ -303,86 +311,59 @@ int AESACC_supported(void) void AES_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) { -#if defined(HAVE_ACC) uint8_t iv_tmp[16]; uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL || iv == NULL) { - if (key == NULL || iv == NULL) - { - return; - } - memcpy(iv_tmp, iv, 16); - aes_setkey_enc(rk, key); - aes_crypt_cbc(AES_ENCRYPT, rk, \ - length, iv_tmp, input, output); return; } -#endif - - AES_CBC_encrypt_buffer0(output, input, length, key, iv); + aeshw_init(); + memcpy(iv_tmp, iv, 16); + setkey_enc(rk, key); + crypt_cbc(AES_ENCRYPT, rk, \ + length, iv_tmp, input, output); } void AES_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv) { -#if defined(HAVE_ACC) uint8_t iv_tmp[16]; uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL || iv == NULL) { - if (key == NULL || iv == NULL) - { - return; - } - memcpy(iv_tmp, iv, 16); - aes_setkey_dec(rk, key); - 
aes_crypt_cbc(AES_DECRYPT, rk, \ - length, iv_tmp, input, output); return; } -#endif + aeshw_init(); + memcpy(iv_tmp, iv, 16); + setkey_dec(rk, key); + crypt_cbc(AES_DECRYPT, rk, \ + length, iv_tmp, input, output); - AES_CBC_decrypt_buffer0(output, input, length, key, iv); } void AES_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length) { -#if defined(HAVE_ACC) uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL) { - if (key == NULL) - { - return; - } - aes_setkey_enc(rk, key); - aes_crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output); return; } -#endif - - AES_ECB_encrypt0(input, key, output, length); + aeshw_init(); + setkey_enc(rk, key); + crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output); } void AES_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length) { -#if defined(HAVE_ACC) uint8_t rk[AES_RKSIZE]; - if (aes_supported()) + if (key == NULL) { - if (key == NULL) - { - return; - } - aes_setkey_dec(rk, key); - aes_crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output); return; } -#endif - - AES_ECB_decrypt0(input, key, output, length); + aeshw_init(); + setkey_dec(rk, key); + crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output); } diff --git a/lib/aes_acc/aesacc.h b/lib/aes_acc/aesacc.h deleted file mode 100644 index afc36d0..0000000 --- a/lib/aes_acc/aesacc.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _AESACC_H_ -#define _AESACC_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -int AESACC_supported(void); -void AESACC_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AESACC_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length); -void AESACC_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); -void AESACC_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv); - -#ifdef __cplusplus -} -#endif - -#endif /* _AESACC_H_ */ diff --git a/lib/aes_acc/aesni.c b/lib/aes_acc/aesni.c index 66e6cb8..85aa22b 100644 --- a/lib/aes_acc/aesni.c +++ b/lib/aes_acc/aesni.c @@ -86,11 +86,11 @@ int aesni_supported( void ) /* * AES-NI AES-ECB block en(de)cryption */ -int aesni_crypt_ecb( int nr, - unsigned char *rk, - int mode, - const unsigned char input[16], - unsigned char output[16] ) +void aesni_crypt_ecb( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ) { asm( "movdqu (%3), %%xmm0 \n\t" // load input "movdqu (%1), %%xmm1 \n\t" // load round key 0 @@ -124,9 +124,6 @@ int aesni_crypt_ecb( int nr, : : "r" (nr), "r" (rk), "r" (mode), "r" (input), "r" (output) : "memory", "cc", "xmm0", "xmm1" ); - - - return( 0 ); } /* diff --git a/lib/aes_acc/aesni.h b/lib/aes_acc/aesni.h index 42b6aa1..b106b9c 100644 --- a/lib/aes_acc/aesni.h +++ b/lib/aes_acc/aesni.h @@ -64,14 +64,12 @@ int aesni_supported( void ); * \param mode AES_ENCRYPT or AES_DECRYPT * \param input 16-byte input block * \param output 16-byte output block - * - * \return 0 on success (cannot fail) */ -int aesni_crypt_ecb( int nr, - unsigned char *rk, - int mode, - const unsigned char input[16], - unsigned char output[16] ); +void aesni_crypt_ecb( int nr, + unsigned char *rk, + int mode, + const unsigned char input[16], + unsigned char output[16] ); /** * \brief Compute decryption round keys from encryption round keys diff --git a/lib/aes_acc/asm/arm64.S b/lib/aes_acc/asm/arm64.S new file mode 100644 index 
0000000..805b78c --- /dev/null +++ b/lib/aes_acc/asm/arm64.S @@ -0,0 +1,1178 @@ +.text + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +## 
+## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adr x11, .Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adr x10, .Lk_inv + movi v17.16b, #0x0f + adr x11, .Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
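+## (Input block in v7, output in v0; x2 points at the key
+## schedule, with the round count stored at x2+240.)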
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, 
%xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adr x10, .Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adr x10, .Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, 
{v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr x10, .Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adr x11, .Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // 
.Lk_inv, .Lk_ipt
+ adr x10, .Lk_dksd
+ ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
+ adr x11, .Lk_mc_forward
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
+ ld1 {v8.2d}, [x10] // .Lk_rcon
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
+ ret
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ add x8, x8, x10
+ cbnz w3, .Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+.Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi .Lschedule_256
+ b.eq .Lschedule_192
+ // 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+.Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+.Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+.Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
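+## (Per FIPS-197 this is the Nk=8 rule: schedule words at
+## i mod 8 == 4 get SubWord only, with no RotWord and no rcon.)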
+## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. 
+## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + cbz x2, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +.globl vpaes_ecb_encrypt +.type vpaes_ecb_encrypt,%function +.align 4 +vpaes_ecb_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! 
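+ // byte count arrives in x2 (moved to x17 below); an odd 16-byte block
+ // is handled first, then the loop does two blocks per pass via
+ // _vpaes_encrypt_2x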
+
+ mov x17, x2
+ mov x2, x3
+ bl _vpaes_encrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_enc_loop
+
+ ld1 {v7.16b}, [x0],#16
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16
+ subs x17, x17, #16
+ b.ls .Lecb_enc_done
+
+.align 4
+.Lecb_enc_loop:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_encrypt_2x
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lecb_enc_loop
+
+.Lecb_enc_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl vpaes_ecb_decrypt
+.type vpaes_ecb_decrypt,%function
+.align 4
+vpaes_ecb_decrypt:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2
+ mov x2, x3
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lecb_dec_loop
+
+ ld1 {v7.16b}, [x0],#16
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1],#16
+ subs x17, x17, #16
+ b.ls .Lecb_dec_done
+
+.align 4
+.Lecb_dec_loop:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lecb_dec_loop
+
+.Lecb_dec_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ ret
+.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
diff --git a/lib/aes_acc/asm/x64.S b/lib/aes_acc/asm/x64.S
new file mode 100644
index 0000000..d193298
--- /dev/null
+++ b/lib/aes_acc/asm/x64.S
@@ -0,0 +1,827 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry

+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte
102,15,56,0,208 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $0x30,%r11 + leaq .Lk_dsbd(%rip),%r10 +.byte 102,15,56,0,193 + andq $0x30,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 +.byte 102,15,56,0,226 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + +.byte 102,15,56,0,226 +.byte 102,15,56,0,197 +.byte 102,15,56,0,203 + pxor %xmm4,%xmm0 + addq $16,%r9 +.byte 102,15,58,15,237,12 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 +.byte 102,15,56,0,208 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,217 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 +.byte 102,15,56,0,220 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 +.byte 102,15,56,0,226 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 +.byte 102,15,56,0,194 + .byte 0xf3,0xc3 +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + movdqu %xmm3,(%rdx) + xorq $0x30,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round +.byte 102,15,58,15,198,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz 
.Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,193 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + .byte 0xf3,0xc3 +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + + pxor %xmm1,%xmm1 +.byte 102,65,15,58,15,200,15 +.byte 102,69,15,58,15,192,15 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 +.byte 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 +.byte 102,15,56,0,208 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 +.byte 102,15,56,0,224 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 +.byte 102,15,56,0,211 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 +.byte 102,15,56,0,220 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 +.byte 102,15,56,0,226 + movdqa %xmm12,%xmm0 +.byte 102,15,56,0,195 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + .byte 0xf3,0xc3 +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 +.byte 102,15,56,0,208 + movdqa 16(%r11),%xmm0 +.byte 102,15,56,0,193 + pxor %xmm2,%xmm0 + .byte 0xf3,0xc3 +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 +.byte 102,15,56,0,229 + movdqa %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 +.byte 102,15,56,0,229 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 +.byte 102,15,56,0,212 + movdqa 16(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 32(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 64(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 +.byte 102,15,56,0,221 + + movdqa 96(%r11),%xmm2 +.byte 102,15,56,0,212 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 +.byte 102,15,56,0,217 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 +.byte 102,15,56,0,217 + addq $-16,%r8 + andq 
$0x30,%r8 + movdqu %xmm3,(%rdx) + .byte 0xf3,0xc3 +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + .byte 0xf3,0xc3 +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +.globl vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_encrypt,.-vpaes_encrypt + +.globl vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + .byte 0xf3,0xc3 +.size vpaes_decrypt,.-vpaes_decrypt +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) +.Lcbc_abort: + .byte 0xf3,0xc3 +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + .byte 0xf3,0xc3 +.size _vpaes_preheat,.-_vpaes_preheat + + + + + +.type _vpaes_consts,@object +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 
0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts diff --git a/makefile b/makefile index 4574c8b..54a80f4 100755 --- a/makefile +++ b/makefile @@ -6,7 +6,7 @@ cc_arm=/home/wangyu/Desktop/arm-2014.05/bin/arm-none-linux-gnueabi-g++ FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers SOURCES=main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp -SOURCES_AES_ACC=main.cpp $(wildcard lib/aes_acc/aes*.c) lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp +SOURCES_AES_ACC=$(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c) NAME=udp2raw TAR=${NAME}_binaries.tar.gz ${NAME}_amd64 ${NAME}_x86 ${NAME}_x86_asm_aes ${NAME}_ar71xx ${NAME}_bcm2708 ${NAME}_arm ${NAME}_amd64_hw_aes ${NAME}_arm_asm_aes @@ -32,16 +32,16 @@ bcm2708: amd64: ${cc_local} -o ${NAME}_amd64 -I. ${SOURCES} ${FLAGS} -lrt -static -O3 amd64_hw_aes: - ${cc_local} -o ${NAME}_amd64_hw_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 + ${cc_local} -o ${NAME}_amd64_hw_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/x64.S x86: ${cc_local} -o ${NAME}_x86 -I. ${SOURCES} ${FLAGS} -lrt -static -O3 -m32 x86_asm_aes: - ${cc_local} -o ${NAME}_x86_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 -DHAVE_ASM lib/aes_acc/asm/x86.S + ${cc_local} -o ${NAME}_x86_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 lib/aes_acc/asm/x86.S arm: ${cc_cross} -o ${NAME}_arm -I. ${SOURCES} ${FLAGS} -lrt -static -O3 arm_asm_aes: - ${cc_cross} -o ${NAME}_arm_asm_aes -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -DHAVE_ASM lib/aes_acc/asm/arm.S + ${cc_cross} -o ${NAME}_arm_asm_aes -I. 
${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/arm.S cross: ${cc_cross} -o ${NAME}_cross -I. ${SOURCES} ${FLAGS} -lrt -O3
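
Note on the makefile hunk above: SOURCES_AES_ACC is now derived from SOURCES
instead of being a second hand-maintained list, so the two can no longer
drift apart when a source file is added or renamed. A minimal sketch of the
idiom, using the lists from this makefile (the wildcard expansion depends on
which lib/aes_acc/aes*.c files actually exist in the tree):

    SOURCES         = main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp
    # drop the portable C AES, pull in the accelerated front-ends
    SOURCES_AES_ACC = $(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c)
    # filter-out leaves: main.cpp lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp

The per-platform assembly (lib/aes_acc/asm/x64.S, x86.S, arm.S) is passed to
the compiler driver as an ordinary input on the matching amd64_hw_aes,
x86_asm_aes and arm_asm_aes targets, and the -DHAVE_ASM define is dropped
from those targets at the same time.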