diff --git a/Cargo.toml b/Cargo.toml
index 5832066..1384cd9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,16 +1,18 @@
 [package]
 name = "crc64fast"
-version = "1.0.0"
+version = "1.1.0"
 authors = ["The TiKV Project Developers"]
 license = "MIT OR Apache-2.0"
-edition = "2018"
+edition = "2021"
 keywords = ["crc", "crc64", "simd", "checksum"]
 repository = "https://github.com/tikv/crc64fast"
 description = "SIMD accelerated CRC64 calculation"
 exclude = ["build_table.rs"]
 readme = "README.md"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+# Note: Rust 1.70 upgraded LLVM to version 16 (in particular https://reviews.llvm.org/D131047).
+# Before that, the compiler was unwilling to generate the PMULL2 instruction on AArch64.
+rust-version = "1.70.0"
 
 [dependencies]
 
diff --git a/README.md b/README.md
index 70745d1..78ae9d1 100644
--- a/README.md
+++ b/README.md
@@ -38,15 +38,6 @@ be chosen based on CPU feature at runtime.
 
 [crc 1.8.1]: https://crates.io/crates/crc
 
-> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a
-> nightly compiler and enable the `pmull` feature to use the SIMD-based
-> implementation:
->
-> ```toml
-> [dependencies]
-> crc64fast = { version = "1.0", features = ["pmull"] }
-> ```
-
 ## TODO
 
 This crate is mainly intended for use in TiKV only.
diff --git a/src/lib.rs b/src/lib.rs
index 847f2c1..fe5e5f3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,11 +18,6 @@
 //! assert_eq!(checksum, 0x8483_c0fa_3260_7d61);
 //! ```
 
-#![cfg_attr(
-    feature = "pmull",
-    feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm)
-)]
-
 mod pclmulqdq;
 mod table;
 
diff --git a/src/pclmulqdq/aarch64.rs b/src/pclmulqdq/aarch64.rs
index c368200..fb7d2ae 100644
--- a/src/pclmulqdq/aarch64.rs
+++ b/src/pclmulqdq/aarch64.rs
@@ -2,7 +2,7 @@
 
 //! AArch64 implementation of the PCLMULQDQ-based CRC calculation.
 
-use std::arch::aarch64::*;
+use std::arch::{aarch64::*, is_aarch64_feature_detected};
 use std::mem::transmute;
 use std::ops::BitXor;
 
@@ -10,6 +10,9 @@ use std::ops::BitXor;
 #[derive(Copy, Clone, Debug)]
 pub struct Simd(uint8x16_t);
 
+#[allow(non_camel_case_types)] // lower-case name kept to match NEON-style types like `uint8x16_t`
+type poly64_t = u64;
+
 impl Simd {
     #[inline]
     #[target_feature(enable = "neon")]
@@ -52,34 +55,12 @@ impl super::SimdExt for Simd {
     }
 
     #[inline]
-    #[target_feature(enable = "crypto", enable = "neon")]
+    #[target_feature(enable = "aes", enable = "neon")]
     unsafe fn fold_16(self, coeff: Self) -> Self {
-        let h: Self;
-        let l: Self;
-
-        // FIXME: When used as a single function, this branch is equivalent to
-        // the ASM below. However, when fold_16 is called inside a loop, for
-        // some reason LLVM replaces the PMULL2 call with a plain PMULL, which
-        // leads unnecessary FMOV calls and slows down the throughput from
-        // 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the
-        // ASM code once this misoptimization is fixed.
-        #[cfg(slow)]
-        {
-            let [x0, x1] = self.into_poly64s();
-            let [c0, c1] = coeff.into_poly64s();
-            h = Self::from_mul(c0, x0);
-            l = Self::from_mul(c1, x1);
-        }
-        #[cfg(not(slow))]
-        {
-            llvm_asm!(
-                "pmull $0.1q, $2.1d, $3.1d
-                pmull2 $1.1q, $2.2d, $3.2d"
-                : "=&w"(l), "=w"(h)
-                : "w"(self), "w"(coeff)
-            );
-        }
-
+        let [x0, x1] = self.into_poly64s(); // split `self` into its two 64-bit lanes
+        let [c0, c1] = coeff.into_poly64s(); // same split for the folding coefficients
+        let h = Self::from_mul(c0, x0); // product of the first lanes (`from_mul` not shown here)
+        let l = Self::from_mul(c1, x1); // product of the second lanes
         h ^ l // XOR is commutative, so the `h`/`l` naming order does not affect the result
     }
 
@@ -110,36 +91,3 @@ impl BitXor for Simd {
         unsafe { Self(veorq_u8(self.0, other.0)) }
     }
 }
-
-//------------------------------------------------------------------------------
-//
-// Below are intrinsics not yet included in Rust.
-
-extern "platform-intrinsic" {
-    fn simd_extract<T, U>(x: T, idx: u32) -> U;
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t {
-    let elem: i64 = simd_extract(a, idx);
-    transmute(elem)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t {
-    transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t {
-    transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vcreate_u8(value: u64) -> uint8x8_t {
-    transmute(value)
-}
diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index b329808..a8efe32 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -9,7 +9,7 @@
 
 #[cfg(not(feature = "fake-simd"))]
 #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
-#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")]
+#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")] // no cargo feature needed on AArch64
 mod arch;
 
 #[cfg(feature = "fake-simd")]
@@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 {
     target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
 )]
 #[cfg_attr(
-    all(target_arch = "aarch64", feature = "pmull"),
-    target_feature(enable = "crypto", enable = "neon")
+    target_arch = "aarch64",
+    target_feature(enable = "aes", enable = "neon")
 )]
 unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 {
     // receive the initial 128 bytes of data