Add NEON optimized implementation for octets::add_assign()

2024-06-27 09:19:02 +00:00 · 2021-02-08 22:27:42 -08:00 · 2021-02-08 22:27:42 -08:00 · e3e9d6dcc2
commit e3e9d6dcc2
parent c1fa4e1f8e
4 changed files with 87 additions and 2 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -44,6 +44,7 @@ lto = false

 [features]
 benchmarking = []
+use_neon = []
 python = ["pyo3"]
 serde_support = ["serde"]

--- a/2
+++ b/2
@ -10,7 +10,7 @@ release: pre
 	cargo build --release

 test: pre
-	cargo build --all-features
+	cargo build --features benchmarking,python,serde_support
 	cargo test --features benchmarking

 test_extended: pre
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,4 +1,7 @@
 #![allow(clippy::needless_return, clippy::unreadable_literal)]
+#![cfg_attr(feature = "use_neon", feature(stdsimd))]
+#![cfg_attr(feature = "use_neon", feature(aarch64_target_feature))]
+#![cfg_attr(feature = "use_neon", feature(arm_target_feature))]

 mod arraymap;
 mod base;
--- a/src/octets.rs
+++ b/src/octets.rs
@ -436,6 +436,72 @@ fn add_assign_fallback(octets: &mut [u8], other: &[u8]) {
    }
 }

+#[cfg(all(target_arch = "aarch64", feature = "use_neon"))]
+use std::arch::aarch64::uint8x16_t;
+#[cfg(all(target_arch = "arm", feature = "use_neon"))]
+use std::arch::arm::uint8x16_t;
+
+#[cfg(all(
+    any(target_arch = "arm", target_arch = "aarch64"),
+    feature = "use_neon"
+))]
+#[target_feature(enable = "neon")]
+unsafe fn store_neon(ptr: *mut uint8x16_t, value: uint8x16_t) {
+    #[cfg(target_arch = "aarch64")]
+    use std::arch::aarch64::*;
+    #[cfg(target_arch = "arm")]
+    use std::arch::arm::*;
+
+    // TODO: replace with vst1q_u8 when it's supported
+    let reinterp = vreinterpretq_u64_u8(value);
+    *(ptr as *mut u64) = vgetq_lane_u64(reinterp, 0);
+    *(ptr as *mut u64).add(1) = vgetq_lane_u64(reinterp, 1);
+}
+
+#[cfg(all(
+    any(target_arch = "arm", target_arch = "aarch64"),
+    feature = "use_neon"
+))]
+#[target_feature(enable = "neon")]
+unsafe fn add_assign_neon(octets: &mut [u8], other: &[u8]) {
+    #[cfg(target_arch = "aarch64")]
+    use std::arch::aarch64::*;
+    #[cfg(target_arch = "arm")]
+    use std::arch::arm::*;
+    use std::mem;
+
+    assert_eq!(octets.len(), other.len());
+    let self_neon_ptr = octets.as_mut_ptr();
+    let other_neon_ptr = other.as_ptr();
+    for i in 0..(octets.len() / 16) {
+        #[allow(clippy::cast_ptr_alignment)]
+        let self_vec = vld1q_u8(self_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
+        #[allow(clippy::cast_ptr_alignment)]
+        let other_vec = vld1q_u8(other_neon_ptr.add(i * mem::size_of::<uint8x16_t>()));
+        let result = veorq_u8(self_vec, other_vec);
+        #[allow(clippy::cast_ptr_alignment)]
+        store_neon((self_neon_ptr as *mut uint8x16_t).add(i), result);
+    }
+
+    let remainder = octets.len() % 16;
+    let self_ptr = octets.as_mut_ptr();
+    let other_ptr = other.as_ptr();
+    for i in ((octets.len() - remainder) / 8)..(octets.len() / 8) {
+        #[allow(clippy::cast_ptr_alignment)]
+        let self_value = (self_ptr as *mut u64).add(i).read_unaligned();
+        #[allow(clippy::cast_ptr_alignment)]
+        let other_value = (other_ptr as *mut u64).add(i).read_unaligned();
+        let result = self_value ^ other_value;
+        #[allow(clippy::cast_ptr_alignment)]
+        (self_ptr as *mut u64).add(i).write_unaligned(result);
+    }
+
+    let remainder = octets.len() % 8;
+    for i in (octets.len() - remainder)..octets.len() {
+        *octets.get_unchecked_mut(i) ^= other.get_unchecked(i);
+    }
+}
+
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 #[target_feature(enable = "avx2")]
 unsafe fn add_assign_avx2(octets: &mut [u8], other: &[u8]) {
@ -530,7 +596,22 @@ pub fn add_assign(octets: &mut [u8], other: &[u8]) {
            }
        }
    }
-
+    #[cfg(all(target_arch = "aarch64", feature = "use_neon"))]
+    {
+        if is_aarch64_feature_detected!("neon") {
+            unsafe {
+                return add_assign_neon(octets, other);
+            }
+        }
+    }
+    #[cfg(all(target_arch = "arm", feature = "use_neon"))]
+    {
+        if is_arm_feature_detected!("neon") {
+            unsafe {
+                return add_assign_neon(octets, other);
+            }
+        }
+    }
    return add_assign_fallback(octets, other);
 }