mirror of
https://github.com/cberner/raptorq.git
synced 2024-06-29 18:21:43 +00:00
Use vshrq_n_u8 in NEON optimizations
Now that https://github.com/rust-lang/rust/issues/82072 is fixed, this intrinsic works and improves mulassign & FMA performance by ~30% on a Raspberry Pi 3 B+. End-to-end speedup is ~5%.
This commit is contained in:
parent
1a4a62e64a
commit
88959e05e6
@@ -290,11 +290,7 @@ unsafe fn mulassign_scalar_neon(octets: &mut [u8], scalar: &Octet) {
         let self_vec = vld1q_u8(self_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
         let low = vandq_u8(self_vec, mask);
         let low_result = vqtbl1q_u8(low_table, low);
-        // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions)
-        // let hi = vshrq_n_u8(self_vec, 4);
-        let mut hi: u128 = mem::transmute(self_vec);
-        hi >>= 4;
-        let hi: uint8x16_t = mem::transmute(hi);
+        let hi = vshrq_n_u8(self_vec, 4);
         let hi = vandq_u8(hi, mask);
         let hi_result = vqtbl1q_u8(hi_table, hi);
         let result = veorq_u8(hi_result, low_result);
@@ -463,11 +459,7 @@ unsafe fn fused_addassign_mul_scalar_neon(octets: &mut [u8], other: &[u8], scala
         let other_vec = vld1q_u8(other_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
         let low = vandq_u8(other_vec, mask);
         let low_result = vqtbl1q_u8(low_table, low);
-        // TODO: replace with vshrq. Right now it doesn't optimize correctly (it generates 16 ubfx instructions)
-        // let hi = vshrq_n_u8(other_vec, 4);
-        let mut hi: u128 = mem::transmute(other_vec);
-        hi >>= 4;
-        let hi: uint8x16_t = mem::transmute(hi);
+        let hi = vshrq_n_u8(other_vec, 4);
         let hi = vandq_u8(hi, mask);
         let hi_result = vqtbl1q_u8(hi_table, hi);
         let other_vec = veorq_u8(hi_result, low_result);
|
Loading…
Reference in New Issue
Block a user