Enable NEON optimized code path on aarch64

This commit is contained in:
Christopher Berner 2022-05-15 20:24:34 -07:00
parent 28136b2d39
commit 9a47489160
4 changed files with 73 additions and 64 deletions

@ -7,7 +7,7 @@ repository = "https://github.com/cberner/raptorq"
readme = "README.md" readme = "README.md"
version = "1.6.5" version = "1.6.5"
edition = "2021" edition = "2021"
rust-version = "1.56" rust-version = "1.60"
authors = ["Christopher Berner <christopherberner@gmail.com>"] authors = ["Christopher Berner <christopherberner@gmail.com>"]
[lib] [lib]
@ -45,7 +45,6 @@ lto = false
[features] [features]
benchmarking = [] benchmarking = []
use_neon = []
python = ["pyo3"] python = ["pyo3"]
serde_support = ["serde"] serde_support = ["serde"]

@ -2,7 +2,7 @@
cd /raptorq cd /raptorq
yum install -y python3-pip yum install -y python3-pip
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=1.46.0 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=1.60.0
source $HOME/.cargo/env source $HOME/.cargo/env
pip3 install toml pip3 install toml

@ -1,7 +1,4 @@
#![allow(clippy::needless_return, clippy::unreadable_literal)] #![allow(clippy::needless_return, clippy::unreadable_literal)]
#![cfg_attr(feature = "use_neon", feature(stdsimd))]
#![cfg_attr(feature = "use_neon", feature(aarch64_target_feature))]
#![cfg_attr(feature = "use_neon", feature(arm_target_feature))]
mod arraymap; mod arraymap;
mod base; mod base;

@ -1,10 +1,23 @@
use crate::octet::Octet; use crate::octet::Octet;
use crate::octet::OCTET_MUL; use crate::octet::OCTET_MUL;
#[cfg(any(target_arch = "x86", target_arch = "x86_64", feature = "use_neon"))] #[cfg(any(
target_arch = "x86",
target_arch = "x86_64",
target_arch = "arm",
target_arch = "aarch64"
))]
use crate::octet::OCTET_MUL_HI_BITS; use crate::octet::OCTET_MUL_HI_BITS;
#[cfg(any(target_arch = "x86", target_arch = "x86_64", feature = "use_neon"))] #[cfg(any(
target_arch = "x86",
target_arch = "x86_64",
target_arch = "arm",
target_arch = "aarch64"
))]
use crate::octet::OCTET_MUL_LOW_BITS; use crate::octet::OCTET_MUL_LOW_BITS;
#[cfg(target_arch = "aarch64")]
use std::arch::is_aarch64_feature_detected;
// An octet vec containing only binary values, which are bit-packed for efficiency // An octet vec containing only binary values, which are bit-packed for efficiency
pub struct BinaryOctetVec { pub struct BinaryOctetVec {
// Values are stored packed into the highest bits, with the last value at the highest bit of the // Values are stored packed into the highest bits, with the last value at the highest bit of the
@ -84,7 +97,7 @@ pub fn fused_addassign_mul_scalar_binary(
} }
} }
} }
#[cfg(all(target_arch = "aarch64", feature = "use_neon"))] #[cfg(target_arch = "aarch64")]
{ {
if is_aarch64_feature_detected!("neon") { if is_aarch64_feature_detected!("neon") {
unsafe { unsafe {
@ -92,13 +105,14 @@ pub fn fused_addassign_mul_scalar_binary(
} }
} }
} }
#[cfg(all(target_arch = "arm", feature = "use_neon"))] #[cfg(target_arch = "arm")]
{ {
if is_arm_feature_detected!("neon") { // TODO: enable when stable
unsafe { // if is_arm_feature_detected!("neon") {
return fused_addassign_mul_scalar_binary_neon(octets, other, scalar); // unsafe {
} // return fused_addassign_mul_scalar_binary_neon(octets, other, scalar);
} // }
// }
} }
// TODO: write an optimized fallback that does not call .to_octet_vec()
@ -109,11 +123,10 @@ pub fn fused_addassign_mul_scalar_binary(
} }
} }
#[cfg(all( #[cfg(target_arch = "aarch64")]
any(target_arch = "arm", target_arch = "aarch64"), // TODO: enable when stable
feature = "use_neon" // #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
))] // #[target_feature(enable = "neon")]
#[target_feature(enable = "neon")]
unsafe fn fused_addassign_mul_scalar_binary_neon( unsafe fn fused_addassign_mul_scalar_binary_neon(
octets: &mut [u8], octets: &mut [u8],
other: &BinaryOctetVec, other: &BinaryOctetVec,
@ -265,11 +278,10 @@ fn mulassign_scalar_fallback(octets: &mut [u8], scalar: &Octet) {
} }
} }
#[cfg(all( #[cfg(target_arch = "aarch64")]
any(target_arch = "arm", target_arch = "aarch64"), // TODO: enable when stable
feature = "use_neon" // #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
))] // #[target_feature(enable = "neon")]
#[target_feature(enable = "neon")]
unsafe fn mulassign_scalar_neon(octets: &mut [u8], scalar: &Octet) { unsafe fn mulassign_scalar_neon(octets: &mut [u8], scalar: &Octet) {
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*; use std::arch::aarch64::*;
@ -402,7 +414,7 @@ pub fn mulassign_scalar(octets: &mut [u8], scalar: &Octet) {
} }
} }
} }
#[cfg(all(target_arch = "aarch64", feature = "use_neon"))] #[cfg(target_arch = "aarch64")]
{ {
if is_aarch64_feature_detected!("neon") { if is_aarch64_feature_detected!("neon") {
unsafe { unsafe {
@ -410,13 +422,14 @@ pub fn mulassign_scalar(octets: &mut [u8], scalar: &Octet) {
} }
} }
} }
#[cfg(all(target_arch = "arm", feature = "use_neon"))] #[cfg(target_arch = "arm")]
{ {
if is_arm_feature_detected!("neon") { // TODO: enable when stable
unsafe { // if is_arm_feature_detected!("neon") {
return mulassign_scalar_neon(octets, scalar); // unsafe {
} // return mulassign_scalar_neon(octets, scalar);
} // }
// }
} }
return mulassign_scalar_fallback(octets, scalar); return mulassign_scalar_fallback(octets, scalar);
@ -433,11 +446,10 @@ fn fused_addassign_mul_scalar_fallback(octets: &mut [u8], other: &[u8], scalar:
} }
} }
#[cfg(all( #[cfg(target_arch = "aarch64")]
any(target_arch = "arm", target_arch = "aarch64"), // TODO: enable when stable
feature = "use_neon" // #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
))] // #[target_feature(enable = "neon")]
#[target_feature(enable = "neon")]
unsafe fn fused_addassign_mul_scalar_neon(octets: &mut [u8], other: &[u8], scalar: &Octet) { unsafe fn fused_addassign_mul_scalar_neon(octets: &mut [u8], other: &[u8], scalar: &Octet) {
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*; use std::arch::aarch64::*;
@ -603,7 +615,7 @@ pub fn fused_addassign_mul_scalar(octets: &mut [u8], other: &[u8], scalar: &Octe
} }
} }
} }
#[cfg(all(target_arch = "aarch64", feature = "use_neon"))] #[cfg(target_arch = "aarch64")]
{ {
if is_aarch64_feature_detected!("neon") { if is_aarch64_feature_detected!("neon") {
unsafe { unsafe {
@ -611,13 +623,14 @@ pub fn fused_addassign_mul_scalar(octets: &mut [u8], other: &[u8], scalar: &Octe
} }
} }
} }
#[cfg(all(target_arch = "arm", feature = "use_neon"))] #[cfg(target_arch = "arm")]
{ {
if is_arm_feature_detected!("neon") { // TODO: enable when stable
unsafe { // if is_arm_feature_detected!("neon") {
return fused_addassign_mul_scalar_neon(octets, other, scalar); // unsafe {
} // return fused_addassign_mul_scalar_neon(octets, other, scalar);
} // }
// }
} }
return fused_addassign_mul_scalar_fallback(octets, other, scalar); return fused_addassign_mul_scalar_fallback(octets, other, scalar);
@ -646,16 +659,16 @@ fn add_assign_fallback(octets: &mut [u8], other: &[u8]) {
} }
} }
#[cfg(all(target_arch = "aarch64", feature = "use_neon"))] #[cfg(target_arch = "aarch64")]
use std::arch::aarch64::uint8x16_t; use std::arch::aarch64::uint8x16_t;
#[cfg(all(target_arch = "arm", feature = "use_neon"))] // TODO: enable when stable
use std::arch::arm::uint8x16_t; // #[cfg(target_arch = "arm")]
// use std::arch::arm::uint8x16_t;
#[cfg(all( #[cfg(target_arch = "aarch64")]
any(target_arch = "arm", target_arch = "aarch64"), // TODO: enable when stable
feature = "use_neon" // #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
))] // #[target_feature(enable = "neon")]
#[target_feature(enable = "neon")]
unsafe fn store_neon(ptr: *mut uint8x16_t, value: uint8x16_t) { unsafe fn store_neon(ptr: *mut uint8x16_t, value: uint8x16_t) {
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*; use std::arch::aarch64::*;
@ -668,11 +681,10 @@ unsafe fn store_neon(ptr: *mut uint8x16_t, value: uint8x16_t) {
*(ptr as *mut u64).add(1) = vgetq_lane_u64(reinterp, 1); *(ptr as *mut u64).add(1) = vgetq_lane_u64(reinterp, 1);
} }
#[cfg(all( #[cfg(target_arch = "aarch64")]
any(target_arch = "arm", target_arch = "aarch64"), // TODO: enable when stable
feature = "use_neon" // #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
))] // #[target_feature(enable = "neon")]
#[target_feature(enable = "neon")]
unsafe fn add_assign_neon(octets: &mut [u8], other: &[u8]) { unsafe fn add_assign_neon(octets: &mut [u8], other: &[u8]) {
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*; use std::arch::aarch64::*;
@ -806,7 +818,7 @@ pub fn add_assign(octets: &mut [u8], other: &[u8]) {
} }
} }
} }
#[cfg(all(target_arch = "aarch64", feature = "use_neon"))] #[cfg(target_arch = "aarch64")]
{ {
if is_aarch64_feature_detected!("neon") { if is_aarch64_feature_detected!("neon") {
unsafe { unsafe {
@ -814,13 +826,14 @@ pub fn add_assign(octets: &mut [u8], other: &[u8]) {
} }
} }
} }
#[cfg(all(target_arch = "arm", feature = "use_neon"))] #[cfg(target_arch = "arm")]
{ {
if is_arm_feature_detected!("neon") { // TODO: enable when stable
unsafe { // if is_arm_feature_detected!("neon") {
return add_assign_neon(octets, other); // unsafe {
} // return add_assign_neon(octets, other);
} // }
// }
} }
return add_assign_fallback(octets, other); return add_assign_fallback(octets, other);
} }