From 88959e05e6f7d9c7ec78338a83faed8fae0c43b5 Mon Sep 17 00:00:00 2001
From: Christopher Berner <christopherberner@gmail.com>
Date: Sat, 16 Oct 2021 17:37:01 -0700
Subject: [PATCH] Use vshrq_n_u8 in neon optimizations

Now that https://github.com/rust-lang/rust/issues/82072 is fixed, the
vshrq_n_u8 intrinsic optimizes correctly (previously it generated 16
ubfx instructions). Using it improves mulassign & FMA performance by
~30% on a Raspberry Pi 3 B+. The end-to-end speedup is ~5%.
---
 src/octets.rs | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/octets.rs b/src/octets.rs
index 1b2de7d..4ecd3dc 100644
--- a/src/octets.rs
+++ b/src/octets.rs
@@ -290,11 +290,7 @@ unsafe fn mulassign_scalar_neon(octets: &mut [u8], scalar: &Octet) {
         let self_vec = vld1q_u8(self_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
         let low = vandq_u8(self_vec, mask);
         let low_result = vqtbl1q_u8(low_table, low);
-        // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions)
-        // let hi = vshrq_n_u8(self_vec, 4);
-        let mut hi: u128 = mem::transmute(self_vec);
-        hi >>= 4;
-        let hi: uint8x16_t = mem::transmute(hi);
+        let hi = vshrq_n_u8(self_vec, 4);
         let hi = vandq_u8(hi, mask);
         let hi_result = vqtbl1q_u8(hi_table, hi);
         let result = veorq_u8(hi_result, low_result);
@@ -463,11 +459,7 @@ unsafe fn fused_addassign_mul_scalar_neon(octets: &mut [u8], other: &[u8], scala
         let other_vec = vld1q_u8(other_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
         let low = vandq_u8(other_vec, mask);
         let low_result = vqtbl1q_u8(low_table, low);
-        // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions)
-        // let hi = vshrq_n_u8(other_vec, 4);
-        let mut hi: u128 = mem::transmute(other_vec);
-        hi >>= 4;
-        let hi: uint8x16_t = mem::transmute(hi);
+        let hi = vshrq_n_u8(other_vec, 4);
         let hi = vandq_u8(hi, mask);
         let hi_result = vqtbl1q_u8(hi_table, hi);
         let other_vec = veorq_u8(hi_result, low_result);