From 88959e05e6f7d9c7ec78338a83faed8fae0c43b5 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Sat, 16 Oct 2021 17:37:01 -0700 Subject: [PATCH] Use vshrq_n_u8 in neon optimizations Now that https://github.com/rust-lang/rust/issues/82072 is fixed, this intrinsic works and improves mulassign & FMA performance by ~30% on Raspberry Pi 3 B+. End-to-end speedup is ~5%. --- src/octets.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/octets.rs b/src/octets.rs index 1b2de7d..4ecd3dc 100644 --- a/src/octets.rs +++ b/src/octets.rs @@ -290,11 +290,7 @@ unsafe fn mulassign_scalar_neon(octets: &mut [u8], scalar: &Octet) { let self_vec = vld1q_u8(self_neon_ptr.add(i * mem::size_of::<uint8x16_t>())); let low = vandq_u8(self_vec, mask); let low_result = vqtbl1q_u8(low_table, low); - // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions) - // let hi = vshrq_n_u8(self_vec, 4); - let mut hi: u128 = mem::transmute(self_vec); - hi >>= 4; - let hi: uint8x16_t = mem::transmute(hi); + let hi = vshrq_n_u8(self_vec, 4); let hi = vandq_u8(hi, mask); let hi_result = vqtbl1q_u8(hi_table, hi); let result = veorq_u8(hi_result, low_result); @@ -463,11 +459,7 @@ unsafe fn fused_addassign_mul_scalar_neon(octets: &mut [u8], other: &[u8], scala let other_vec = vld1q_u8(other_neon_ptr.add(i * mem::size_of::<uint8x16_t>())); let low = vandq_u8(other_vec, mask); let low_result = vqtbl1q_u8(low_table, low); - // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions) - // let hi = vshrq_n_u8(other_vec, 4); - let mut hi: u128 = mem::transmute(other_vec); - hi >>= 4; - let hi: uint8x16_t = mem::transmute(hi); + let hi = vshrq_n_u8(other_vec, 4); let hi = vandq_u8(hi, mask); let hi_result = vqtbl1q_u8(hi_table, hi); let other_vec = veorq_u8(hi_result, low_result);