diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index 92e1d13..3e38f2c 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -8,8 +8,9 @@
 //! [white paper]: https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 
 #[cfg(not(feature = "fake-simd"))]
-#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
+#[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
 #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
+#[cfg_attr(target_arch = "x86", path = "x86.rs")]
 mod arch;
 
 #[cfg(feature = "fake-simd")]
diff --git a/src/pclmulqdq/x86.rs b/src/pclmulqdq/x86.rs
index 2fce99a..55532c1 100644
--- a/src/pclmulqdq/x86.rs
+++ b/src/pclmulqdq/x86.rs
@@ -1,11 +1,9 @@
 // Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
 
-//! x86/x86_64 implementation of the PCLMULQDQ-based CRC calculation.
+//! x86 (32-bit) implementation of the PCLMULQDQ-based CRC calculation.
 
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
 use std::ops::BitXor;
 
 #[repr(transparent)]
@@ -16,13 +14,20 @@ impl super::SimdExt for Simd {
     fn is_supported() -> bool {
         is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
             && is_x86_feature_detected!("sse2") // (all other _mm_*)
-            && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
+            && is_x86_feature_detected!("sse4.1")
     }
 
     #[inline]
     #[target_feature(enable = "sse2")]
     unsafe fn new(high: u64, low: u64) -> Self {
-        Self(_mm_set_epi64x(high as i64, low as i64))
+        // On 32-bit systems, we need to split u64 into low and high 32-bit parts
+        let high_low = (high & 0xFFFFFFFF) as i32;
+        let high_high = ((high >> 32) & 0xFFFFFFFF) as i32;
+        let low_low = (low & 0xFFFFFFFF) as i32;
+        let low_high = ((low >> 32) & 0xFFFFFFFF) as i32;
+
+        // Create the 128-bit vector using 32-bit parts
+        Self(_mm_set_epi32(high_high, high_low, low_high, low_low))
     }
 
     #[inline]
@@ -50,7 +55,15 @@ impl super::SimdExt for Simd {
         let h = Self(_mm_slli_si128(t1, 8));
         let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
         let reduced = h ^ l ^ self;
-        _mm_extract_epi64(reduced.0, 1) as u64
+
+        // Store the result in memory and read it back as u64
+        // This approach is more reliable for handling 64-bit values on 32-bit systems
+        let mut result: [u32; 4] = [0; 4];
+        _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, reduced.0);
+
+        // Combine the two 32-bit values into a 64-bit result
+        // We want the high 64 bits (indices 2 and 3)
+        ((result[3] as u64) << 32) | (result[2] as u64)
     }
 }
diff --git a/src/pclmulqdq/x86_64.rs b/src/pclmulqdq/x86_64.rs
new file mode 100644
index 0000000..63abd80
--- /dev/null
+++ b/src/pclmulqdq/x86_64.rs
@@ -0,0 +1,61 @@
+// Copyright 2020 TiKV Project Authors. Licensed under MIT or Apache-2.0.
+
+//! x86_64 implementation of the PCLMULQDQ-based CRC calculation.
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+use std::ops::BitXor;
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug)]
+pub struct Simd(__m128i);
+
+impl super::SimdExt for Simd {
+    fn is_supported() -> bool {
+        is_x86_feature_detected!("pclmulqdq") // _mm_clmulepi64_si128
+            && is_x86_feature_detected!("sse2") // (all other _mm_*)
+            && is_x86_feature_detected!("sse4.1") // _mm_extract_epi64
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn new(high: u64, low: u64) -> Self {
+        Self(_mm_set_epi64x(high as i64, low as i64))
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "pclmulqdq")]
+    unsafe fn fold_16(self, coeff: Self) -> Self {
+        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x11));
+        let l = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
+        h ^ l
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "pclmulqdq")]
+    unsafe fn fold_8(self, coeff: u64) -> Self {
+        let coeff = Self::new(0, coeff);
+        let h = Self(_mm_clmulepi64_si128(self.0, coeff.0, 0x00));
+        let l = Self(_mm_srli_si128(self.0, 8));
+        h ^ l
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2", enable = "sse4.1", enable = "pclmulqdq")]
+    unsafe fn barrett(self, poly: u64, mu: u64) -> u64 {
+        let polymu = Self::new(poly, mu);
+        let t1 = _mm_clmulepi64_si128(self.0, polymu.0, 0x00);
+        let h = Self(_mm_slli_si128(t1, 8));
+        let l = Self(_mm_clmulepi64_si128(t1, polymu.0, 0x10));
+        let reduced = h ^ l ^ self;
+        _mm_extract_epi64(reduced.0, 1) as u64
+    }
+}
+
+impl BitXor for Simd {
+    type Output = Self;
+
+    fn bitxor(self, other: Self) -> Self {
+        Self(unsafe { _mm_xor_si128(self.0, other.0) })
+    }
+}