diff --git a/Cargo.toml b/Cargo.toml index 5832066..1384cd9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,18 @@ [package] name = "crc64fast" -version = "1.0.0" +version = "1.1.0" authors = ["The TiKV Project Developers"] license = "MIT OR Apache-2.0" -edition = "2018" +edition = "2021" keywords = ["crc", "crc64", "simd", "checksum"] repository = "https://github.com/tikv/crc64fast" description = "SIMD accelerated CRC64 calculation" exclude = ["build_table.rs"] readme = "README.md" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: Rust 1.70 upgraded LLVM version to 16 (in particular https://reviews.llvm.org/D131047) +# Before that, the compiler is unwilling to generate the PMULL2 instruction on AArch64. +rust-version = "1.70.0" [dependencies] diff --git a/README.md b/README.md index 70745d1..78ae9d1 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,6 @@ be chosen based on CPU feature at runtime. [crc 1.8.1]: https://crates.io/crates/crc -> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a -> nightly compiler and enable the `pmull` feature to use the SIMD-based -> implementation: -> -> ```toml -> [dependencies] -> crc64fast = { version = "1.0", features = ["pmull"] } -> ``` - ## TODO This crate is mainly intended for use in TiKV only. diff --git a/src/lib.rs b/src/lib.rs index 847f2c1..fe5e5f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,11 +18,6 @@ //! assert_eq!(checksum, 0x8483_c0fa_3260_7d61); //! ``` -#![cfg_attr( - feature = "pmull", - feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm) -)] - mod pclmulqdq; mod table; diff --git a/src/pclmulqdq/aarch64.rs b/src/pclmulqdq/aarch64.rs index c368200..fb7d2ae 100644 --- a/src/pclmulqdq/aarch64.rs +++ b/src/pclmulqdq/aarch64.rs @@ -2,7 +2,7 @@ //! AArch64 implementation of the PCLMULQDQ-based CRC calculation. -use std::arch::aarch64::*; +use std::arch::{aarch64::*, is_aarch64_feature_detected}; use std::mem::transmute; use std::ops::BitXor; @@ -10,6 +10,9 @@ use std::ops::BitXor; #[derive(Copy, Clone, Debug)] pub struct Simd(uint8x16_t); +#[allow(non_camel_case_types)] +type poly64_t = u64; + impl Simd { #[inline] #[target_feature(enable = "neon")] @@ -52,34 +55,12 @@ impl super::SimdExt for Simd { } #[inline] - #[target_feature(enable = "crypto", enable = "neon")] + #[target_feature(enable = "aes", enable = "neon")] unsafe fn fold_16(self, coeff: Self) -> Self { - let h: Self; - let l: Self; - - // FIXME: When used as a single function, this branch is equivalent to - // the ASM below. However, when fold_16 is called inside a loop, for - // some reason LLVM replaces the PMULL2 call with a plain PMULL, which - // leads unnecessary FMOV calls and slows down the throughput from - // 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the - // ASM code once this misoptimization is fixed. - #[cfg(slow)] - { - let [x0, x1] = self.into_poly64s(); - let [c0, c1] = coeff.into_poly64s(); - h = Self::from_mul(c0, x0); - l = Self::from_mul(c1, x1); - } - #[cfg(not(slow))] - { - llvm_asm!( - "pmull $0.1q, $2.1d, $3.1d - pmull2 $1.1q, $2.2d, $3.2d" - : "=&w"(l), "=w"(h) - : "w"(self), "w"(coeff) - ); - } - + let [x0, x1] = self.into_poly64s(); + let [c0, c1] = coeff.into_poly64s(); + let h = Self::from_mul(c0, x0); + let l = Self::from_mul(c1, x1); h ^ l } @@ -110,36 +91,3 @@ impl BitXor for Simd { unsafe { Self(veorq_u8(self.0, other.0)) } } } - -//------------------------------------------------------------------------------ -// -// Below are intrinsics not yet included in Rust. - -extern "platform-intrinsic" { - fn simd_extract(x: T, idx: u32) -> U; -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t { - let elem: i64 = simd_extract(a, idx); - transmute(elem) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t { - transmute(a) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t { - transmute(a) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vcreate_u8(value: u64) -> uint8x8_t { - transmute(value) -} diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index b329808..a8efe32 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -9,7 +9,7 @@ #[cfg(not(feature = "fake-simd"))] #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")] -#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")] +#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")] mod arch; #[cfg(feature = "fake-simd")] @@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 { target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1") )] #[cfg_attr( - all(target_arch = "aarch64", feature = "pmull"), - target_feature(enable = "crypto", enable = "neon") + target_arch = "aarch64", + target_feature(enable = "aes", enable = "neon") )] unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 { // receive the initial 128 bytes of data