From 43215c96a455742319a6b00ee11271dee4bd2f02 Mon Sep 17 00:00:00 2001
From: Don MacAskill
Date: Wed, 11 Dec 2024 14:22:17 -0800
Subject: [PATCH] Fix support for x86 (32-bit)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

x86 (32-bit) systems don’t have access to ‘_mm_extract_epi64’ in SSE4.1, so an alternate approach is required to support those systems.
---
 src/pclmulqdq/mod.rs    |  3 +-
 src/pclmulqdq/x86.rs    | 30 ++++++++++++++++----
 src/pclmulqdq/x86_64.rs | 61 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 7 deletions(-)
 create mode 100644 src/pclmulqdq/x86_64.rs

diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index 92e1d13..3e38f2c 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -8,8 +8,9 @@
 //! [white paper]: https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 
 #[cfg(not(feature = "fake-simd"))]
-#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
+#[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
 #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
+#[cfg_attr(target_arch = "x86", path = "x86.rs")]
 mod arch;
 
 #[cfg(feature = "fake-simd")]
diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86.rs
index 2fce99a..0c9d728 100644
--- a/src/pclmulqdq/x86.rs
+++ b/src/pclmulqdq/x86.rs
@@ -1,11 +1,9 @@
 // Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
 
-//! x86/x86_64 implementation of the PCLMULQDQ-based CRC calculation.
+//! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation.
 
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
 use std::ops::BitXor;
 
 #[repr(transparent)]
@@ -16,13 +14,25 @@ impl super::SimdExt for Simd {
     fn is_supported() -> bool {
         is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
             && is_x86_feature_detected!("sse2") // (all other _mm_*)
-            && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
+            && is_x86_feature_detected!("sse4.1")
     }
 
     #[inline]
     #[target_feature(enable = "sse2")]
     unsafe fn new(high: u64, low: u64) -> Self {
-        Self(_mm_set_epi64x(high as i64, low as i64))
+        // On 32-bit systems, we need to split u64 into low and high 32-bit parts
+        let high_low = (high & 0xFFFFFFFF) as i32;
+        let high_high = ((high >> 32) & 0xFFFFFFFF) as i32;
+        let low_low = (low & 0xFFFFFFFF) as i32;
+        let low_high = ((low >> 32) & 0xFFFFFFFF) as i32;
+
+        // Create the 128-bit vector using 32-bit parts
+        Self(_mm_set_epi32(
+            high_high,
+            high_low,
+            low_high,
+            low_low,
+        ))
     }
 
     #[inline]
@@ -50,7 +60,15 @@ impl super::SimdExt for Simd {
         let h = Self(_mm_slli_si128(t1, 8));
         let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
         let reduced = h ^ l ^ self;
-        _mm_extract_epi64(reduced.0, 1) as u64
+
+        // Store the result in memory and read it back as u64
+        // This approach is more reliable for handling 64-bit values on 32-bit systems
+        let mut result: [u32; 4] = [0; 4];
+        _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0);
+
+        // Combine the two 32-bit values into a 64-bit result
+        // We want the high 64 bits (indices 2 and 3)
+        ((result[3] as u64) << 32) | (result[2] as u64)
     }
 }
 
diff --git a/src/pclmulqdq/x86_64.rs b/src/pclmulqdq/x86_64.rs
new file mode 100644
index 0000000..63abd80
--- /dev/null
+++ b/src/pclmulqdq/x86_64.rs
@@ -0,0 +1,61 @@
+// Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
+
+//! x86_64 implementation of the PCLMULQDQ-based CRC calculation.
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use std::ops::BitXor;
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug)]
+pub struct Simd(__m128i);
+
+impl super::SimdExt for Simd {
+    fn is_supported() -> bool {
+        is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
+            && is_x86_feature_detected!("sse2") // (all other _mm_*)
+            && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn new(high: u64, low: u64) -> Self {
+        Self(_mm_set_epi64x(high as i64, low as i64))
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "pclmulqdq")]
+    unsafe fn fold_16(self, coeff: Self) -> Self {
+        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x11));
+        let l = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
+        h ^ l
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "pclmulqdq")]
+    unsafe fn fold_8(self, coeff: u64) -> Self {
+        let coeff = Self::new(0, coeff);
+        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
+        let l = Self(_mm_srli_si128(self.0, 8));
+        h ^ l
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")]
+    unsafe fn barrett(self, poly: u64, mu: u64) -> u64 {
+        let polymu = Self::new(poly, mu);
+        let t1 = _mm_clmulepi64_si128(self.0, polymu.0, 0x00);
+        let h = Self(_mm_slli_si128(t1, 8));
+        let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
+        let reduced = h ^ l ^ self;
+        _mm_extract_epi64(reduced.0, 1) as u64
+    }
+}
+
+impl BitXor for Simd {
+    type Output = Self;
+
+    fn bitxor(self, other: Self) -> Self {
+        Self(unsafe { _mm_xor_si128(self.0, other.0) })
+    }
+}