pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics.
Signed-off-by: kennytm <[email protected]>
kennytm committed Jan 17, 2024
1 parent baedd67 commit 3ffba7e
Showing 5 changed files with 17 additions and 81 deletions.
8 changes: 5 additions & 3 deletions Cargo.toml
@@ -1,16 +1,18 @@
[package]
name = "crc64fast"
version = "1.0.0"
version = "1.1.0"
authors = ["The TiKV Project Developers"]
license = "MIT OR Apache-2.0"
edition = "2018"
edition = "2021"
keywords = ["crc", "crc64", "simd", "checksum"]
repository = "https://github.com/tikv/crc64fast"
description = "SIMD accelerated CRC64 calculation"
exclude = ["build_table.rs"]
readme = "README.md"

-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+# Note: Rust 1.70 upgraded LLVM version to 16 (in particular https://reviews.llvm.org/D131047)
+# Before that, the compiler is unwilling to generate the PMULL2 instruction on AArch64.
+rust-version = "1.70.0"

[dependencies]

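The new manifest comment above is the crux of the MSRV bump: LLVM 16, which ships with Rust 1.70, is what makes the compiler willing to emit PMULL2 for the folding pattern used later in this commit. For reference, a minimal sketch (not part of this commit) of the two stable intrinsics that lower to PMULL and PMULL2 on AArch64:

```rust
use std::arch::aarch64::{poly64x2_t, vgetq_lane_p64, vmull_high_p64, vmull_p64};

// Sketch only: `vmull_p64` lowers to PMULL and `vmull_high_p64` to PMULL2.
// The crate's `fold_16` below instead extracts lanes and multiplies them,
// relying on LLVM 16+ to pick PMULL2 for the high half (per the manifest note).
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn clmul_both_halves(a: poly64x2_t, b: poly64x2_t) -> (u128, u128) {
    let low = vmull_p64(vgetq_lane_p64::<0>(a), vgetq_lane_p64::<0>(b));
    let high = vmull_high_p64(a, b);
    (low, high)
}
```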
9 changes: 0 additions & 9 deletions README.md
@@ -38,15 +38,6 @@ be chosen based on CPU feature at runtime.

[crc 1.8.1]: https://crates.io/crates/crc

-> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a
-> nightly compiler and enable the `pmull` feature to use the SIMD-based
-> implementation:
->
-> ```toml
-> [dependencies]
-> crc64fast = { version = "1.0", features = ["pmull"] }
-> ```
## TODO

This crate is mainly intended for use in TiKV only.
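With the AArch64 intrinsics stabilized, the nightly-only setup in the removed note no longer applies; presumably a plain dependency line is all that is needed now (a sketch, not text taken from the README):

```toml
[dependencies]
crc64fast = "1.1"
```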
5 changes: 0 additions & 5 deletions src/lib.rs
@@ -18,11 +18,6 @@
//! assert_eq!(checksum, 0x8483_c0fa_3260_7d61);
//! ```
-#![cfg_attr(
-feature = "pmull",
-feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm)
-)]

mod pclmulqdq;
mod table;

70 changes: 9 additions & 61 deletions src/pclmulqdq/aarch64.rs
@@ -2,14 +2,17 @@

//! AArch64 implementation of the PCLMULQDQ-based CRC calculation.
-use std::arch::aarch64::*;
+use std::arch::{aarch64::*, is_aarch64_feature_detected};
use std::mem::transmute;
use std::ops::BitXor;

#[repr(transparent)]
#[derive(Copy, Clone, Debug)]
pub struct Simd(uint8x16_t);

+#[allow(non_camel_case_types)]
+type poly64_t = u64;

impl Simd {
#[inline]
#[target_feature(enable = "neon")]
@@ -52,34 +55,12 @@ impl super::SimdExt for Simd {
}

#[inline]
#[target_feature(enable = "crypto", enable = "neon")]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn fold_16(self, coeff: Self) -> Self {
-let h: Self;
-let l: Self;
-
-// FIXME: When used as a single function, this branch is equivalent to
-// the ASM below. However, when fold_16 is called inside a loop, for
-// some reason LLVM replaces the PMULL2 call with a plain PMULL, which
-// leads unnecessary FMOV calls and slows down the throughput from
-// 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the
-// ASM code once this misoptimization is fixed.
-#[cfg(slow)]
-{
-let [x0, x1] = self.into_poly64s();
-let [c0, c1] = coeff.into_poly64s();
-h = Self::from_mul(c0, x0);
-l = Self::from_mul(c1, x1);
-}
-#[cfg(not(slow))]
-{
-llvm_asm!(
-"pmull $0.1q, $2.1d, $3.1d
-pmull2 $1.1q, $2.2d, $3.2d"
-: "=&w"(l), "=w"(h)
-: "w"(self), "w"(coeff)
-);
-}
-
+let [x0, x1] = self.into_poly64s();
+let [c0, c1] = coeff.into_poly64s();
+let h = Self::from_mul(c0, x0);
+let l = Self::from_mul(c1, x1);
h ^ l
}
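The helpers used in the new `fold_16` body (`into_poly64s`, `from_mul`) are defined outside this hunk. A hypothetical sketch of what a `from_mul`-style helper can look like with the stabilized intrinsics; the name and exact shape are assumptions, not the crate's code:

```rust
use std::arch::aarch64::{uint8x16_t, vmull_p64, vreinterpretq_u8_p128};

// Hypothetical sketch: carry-less multiply of two 64-bit polynomials
// (PMULL), with the 128-bit product reinterpreted as a byte vector.
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn poly64_mul_to_bytes(a: u64, b: u64) -> uint8x16_t {
    vreinterpretq_u8_p128(vmull_p64(a, b))
}
```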

@@ -110,36 +91,3 @@ impl BitXor for Simd {
unsafe { Self(veorq_u8(self.0, other.0)) }
}
}

-//------------------------------------------------------------------------------
-//
-// Below are intrinsics not yet included in Rust.
-
-extern "platform-intrinsic" {
-fn simd_extract<T, U>(x: T, idx: u32) -> U;
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t {
-let elem: i64 = simd_extract(a, idx);
-transmute(elem)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t {
-transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t {
-transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vcreate_u8(value: u64) -> uint8x8_t {
-transmute(value)
-}
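These hand-rolled shims are deleted because stable `std::arch::aarch64` now provides equivalents. A sketch of the stable counterparts (note that the stable `vgetq_lane_p64` takes its lane index as a const generic rather than a runtime `u32`):

```rust
use std::arch::aarch64::{
    poly64x2_t, uint8x16_t, uint8x8_t, vcreate_u8, vgetq_lane_p64,
    vreinterpretq_p64_u8, vreinterpretq_u8_p128,
};

// Sketch only: the stable intrinsics covering what the removed shims did.
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn stable_shim_equivalents(bytes: uint8x16_t, product: u128, word: u64) {
    let lanes: poly64x2_t = vreinterpretq_p64_u8(bytes); // was a transmute shim
    let _lane0: u64 = vgetq_lane_p64::<0>(lanes); // lane index is const-generic
    let _as_bytes: uint8x16_t = vreinterpretq_u8_p128(product);
    let _low_half: uint8x8_t = vcreate_u8(word);
}
```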
6 changes: 3 additions & 3 deletions src/pclmulqdq/mod.rs
@@ -9,7 +9,7 @@
#[cfg(not(feature = "fake-simd"))]
#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
-#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")]
+#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
mod arch;

#[cfg(feature = "fake-simd")]
@@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 {
target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
)]
#[cfg_attr(
-all(target_arch = "aarch64", feature = "pmull"),
-target_feature(enable = "crypto", enable = "neon")
+target_arch = "aarch64",
+target_feature(enable = "aes", enable = "neon")
)]
unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 {
// receive the initial 128 bytes of data
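With the compile-time `pmull` feature gone, the aarch64 module is built unconditionally on AArch64 and the accelerated path has to be chosen at runtime, which is what the `is_aarch64_feature_detected` import added in aarch64.rs is for. A sketch of that dispatch pattern, using placeholder function names rather than the crate's own identifiers:

```rust
// Sketch with placeholder names: `simd_update` and `table_update` are not
// the crate's identifiers, they only stand in for the two code paths.
#[cfg(target_arch = "aarch64")]
fn dispatch_update(state: u64, bytes: &[u8]) -> u64 {
    use std::arch::is_aarch64_feature_detected;
    if is_aarch64_feature_detected!("aes") && is_aarch64_feature_detected!("neon") {
        // SAFETY: the required CPU features were verified just above.
        unsafe { simd_update(state, bytes) }
    } else {
        table_update(state, bytes)
    }
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn simd_update(state: u64, _bytes: &[u8]) -> u64 {
    state // placeholder body so the sketch stands alone
}

#[cfg(target_arch = "aarch64")]
fn table_update(state: u64, _bytes: &[u8]) -> u64 {
    state // placeholder body so the sketch stands alone
}
```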
