From 135cbf492c254399e505615edbefbc3b7feb626a Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 10 Dec 2024 20:01:03 +0800 Subject: [PATCH 1/5] x86 sse: add loongarch lsx optimized implementations --- simde/x86/sse.h | 221 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 169 insertions(+), 52 deletions(-) diff --git a/simde/x86/sse.h b/simde/x86/sse.h index 6bdf20cee..110142ca9 100644 --- a/simde/x86/sse.h +++ b/simde/x86/sse.h @@ -658,6 +658,8 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.neon_f32 = vrndiq_f32(a_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrintrne_s(a_.lsx_f32); #elif defined(simde_math_nearbyintf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -772,6 +774,9 @@ simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_flo r_.neon_f32 = vld1q_f32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde_float32 data[4] = { e0, e1, e2, e3 }; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.f32[0] = e0; r_.f32[1] = e1; @@ -936,6 +941,8 @@ simde_mm_add_ss (simde__m128 a, simde__m128 b) { float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . r_.neon_f32 = vaddq_f32(a_.neon_f32, value); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32), 0x00); #else r_.f32[0] = a_.f32[0] + b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -1261,6 +1268,10 @@ simde_x_mm_abs_ps(simde__m128 a) { r_.altivec_f32 = vec_abs(a_.altivec_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vreplgr2vr_w(0); + __m128 temp1 = __lsx_vfsub_s((__m128)temp, a_.lsx_f32); + r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, temp1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1322,12 +1333,15 @@ simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1385,12 +1399,15 @@ simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1448,12 +1465,15 @@ simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1511,11 +1531,15 @@ simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif @@ -1574,11 +1598,15 @@ simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif @@ -1638,12 +1666,15 @@ simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.u32[i] = a_.u32[i]; + } + #endif return simde__m128_from_private(r_); #endif } @@ -1825,8 +1856,9 @@ simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - - #if defined(simde_math_isnanf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_s(a_.lsx_f32, a_.lsx_f32), __lsx_vfcmp_cune_s(b_.lsx_f32, b_.lsx_f32)), 0); + #elif defined(simde_math_isnanf) r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? 
~UINT32_C(0) : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -1861,6 +1893,8 @@ simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] == b_.f32[0]; #endif @@ -1888,6 +1922,8 @@ simde_mm_comige_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); #else return a_.f32[0] >= b_.f32[0]; #endif @@ -1915,6 +1951,8 @@ simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); #else return a_.f32[0] > b_.f32[0]; #endif @@ -1942,6 +1980,8 @@ simde_mm_comile_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] <= b_.f32[0]; #endif @@ -1969,6 +2009,8 @@ simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] < b_.f32[0]; #endif @@ -1996,6 +2038,8 @@ simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_w(__lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); #else return a_.f32[0] != b_.f32[0]; #endif @@ -2125,6 +2169,9 @@ simde_mm_cvt_si2ss (simde__m128 a, int32_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + float b_temp = (float)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, __lsx_vldrepl_w(&b_temp, 0), 0); #else r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); r_.i32[1] = a_.i32[1]; @@ -2146,6 +2193,8 @@ simde_mm_cvt_ss2si (simde__m128 a) { return _mm_cvt_ss2si(a); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + return 
__lsx_vpickve2gr_w(__lsx_vftintrne_w_s(simde__m128_to_lsx_f32(a)), 0); #else simde__m128_private a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); #if !defined(SIMDE_FAST_CONVERSION_RANGE) @@ -2172,6 +2221,8 @@ simde_mm_cvtpi16_ps (simde__m64 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vffint_s_w(__lsx_vsllwil_w_h(__lsx_vldrepl_d(&a_.i16, 0), 0)); #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16); #else @@ -2430,6 +2481,9 @@ simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 b_temp = (simde_float32)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vldrepl_w(&(b_temp), 0), 0); #else r_ = a_; r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); @@ -2457,6 +2511,9 @@ simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 b_temp = (simde_float32)b; + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vldrepl_w(&(b_temp), 0), 0); #else r_ = a_; r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); @@ -2478,6 +2535,10 @@ simde_mm_cvtss_f32 (simde__m128 a) { simde__m128_private a_ = simde__m128_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vgetq_lane_f32(a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float32 temp; + __lsx_vstelm_w(a_.lsx_f32, &temp, 0, 0); + return temp; #else return a_.f32[0]; #endif @@ -2509,6 +2570,8 @@ simde_mm_cvtss_si64 (simde__m128 a) { simde__m128_private a_ = simde__m128_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(__lsx_vftintrne_l_d(__lsx_vfcvtl_d_s(a_.lsx_f32)), 0); #else return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0])); #endif @@ -2561,6 +2624,8 @@ simde_mm_cvtt_ss2si (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + return __lsx_vpickve2gr_w(__lsx_vftintrz_w_s(a_.lsx_f32), 0); #else simde_float32 v = a_.f32[0]; #if !defined(SIMDE_FAST_CONVERSION_RANGE) @@ -2592,6 +2657,8 @@ simde_mm_cvttss_si64 (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return SIMDE_CONVERT_FTOI(int64_t, __lsx_vpickve2gr_w(__lsx_vftintrz_w_s(a_.lsx_f32), 0)); #else return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); #endif @@ -2615,7 +2682,11 @@ simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a); - #if defined(simde_math_isnanf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128_private b_ = simde__m128_to_private(b); + __m128i temp = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(temp, temp), 0); + #elif defined(simde_math_isnanf) r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? 
UINT32_C(0) : ~UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -2690,6 +2761,8 @@ simde_mm_div_ss (simde__m128 a, simde__m128 b) { float32_t value = vgetq_lane_f32(simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = a_.f32[0] / b_.f32[0]; SIMDE_VECTORIZE @@ -2811,6 +2884,8 @@ simde_mm_load_ss (simde_float32 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load32_zero(mem_addr); #else @@ -2841,6 +2916,8 @@ simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(__lsx_vldrepl_d(mem_addr, 0), a_.lsx_i64, 0); #else simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr); r_.f32[0] = a_.f32[0]; @@ -2884,6 +2961,8 @@ simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(mem_addr, 0), 0); #else simde__m64_private b_; simde_memcpy(&b_, mem_addr, sizeof(b_)); @@ -3101,6 +3180,8 @@ simde_mm_max_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32_t value = vgetq_lane_f32(maxq_f32(a_.neon_f32, b_.neon_f32), 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -3237,6 +3318,8 @@ simde_mm_min_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32_t value = vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32), 0); #else r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? 
a_.f32[0] : b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -3456,10 +3539,14 @@ simde_mm_mul_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - r_.f32[0] = a_.f32[0] * b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.f32[0] = a_.f32[0] * b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif @@ -3779,11 +3866,14 @@ simde_mm_rcp_ss (simde__m128 a) { simde__m128_private r_, a_ = simde__m128_to_private(a); - - r_.f32[0] = 1.0f / a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfrecip_s(a_.lsx_f32), 0); + #else + r_.f32[0] = 1.0f / a_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif @@ -3872,6 +3962,8 @@ simde_mm_rsqrt_ss (simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfrsqrt_s(a_.lsx_f32), 0); #elif defined(SIMDE_IEEE754_STORAGE) { #if SIMDE_ACCURACY_PREFERENCE <= 0 @@ -3994,6 +4086,8 @@ simde_mm_setzero_ps (void) { return vec_splats(SIMDE_FLOAT32_C(0.0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f32x4_const(0.f, 0.f, 0.f, 0.f); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)__lsx_vreplgr2vr_w(0); #else simde__m128 r; simde_memset(&r, 0, sizeof(r)); @@ -4132,6 +4226,8 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) } #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) # define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_ps(a, b, imm8) (simde__m128)(__lsx_vpermi_w(simde__m128_to_private(b).lsx_i64, simde__m128_to_private(a).i64, imm8)) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ simde__m128_from_private((simde__m128_private) { .wasm_v128 = \ @@ -4227,6 +4323,8 @@ simde_mm_sqrt_ss (simde__m128 a) { float32_t value = vgetq_lane_f32(simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0); r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfsqrt_s(a_.lsx_f32), 0); #elif defined(simde_math_sqrtf) r_.f32[0] = simde_math_sqrtf(a_.f32[0]); r_.f32[1] = a_.f32[1]; @@ -4339,6 +4437,8 @@ simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, HEDLEY_REINTERPRET_CAST(void*, mem_addr), 0, 1); #else simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1])); #endif @@ -4356,6 +4456,9 @@ simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) simde__m128_private a_ = simde__m128_to_private(a); wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), 
a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128_private a_ = simde__m128_to_private(a); + __lsx_vstelm_d(a_.lsx_i64, HEDLEY_REINTERPRET_CAST(void*, mem_addr), 0, 0); #else simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr); simde__m128_private a_ = simde__m128_to_private(a); @@ -4476,12 +4579,14 @@ simde_mm_sub_ss (simde__m128 a, simde__m128 b) { r_, a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] - b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32), 0); + #else + r_.f32[0] = a_.f32[0] - b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + #endif return simde__m128_from_private(r_); #endif } @@ -4509,6 +4614,8 @@ simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4545,6 +4652,8 @@ simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4581,6 +4690,8 @@ simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4617,6 +4728,8 @@ simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4653,6 +4766,8 @@ simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_w(__lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4689,6 +4804,8 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = 
!!__lsx_vpickve2gr_w(__lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); From c446819d96a2dc8e2a2a19518cdd84b4b196ef52 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 10 Dec 2024 20:08:29 +0800 Subject: [PATCH 2/5] x86 sse3: add loongarch lsx optimized implementations --- simde/x86/sse3.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/simde/x86/sse3.h b/simde/x86/sse3.h index db2683c30..4f83a5105 100644 --- a/simde/x86/sse3.h +++ b/simde/x86/sse3.h @@ -48,6 +48,8 @@ simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = t.val[0]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6, 8, 10, 12, 14); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14); #else @@ -76,6 +78,8 @@ simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = t.val[1]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15); #else @@ -104,6 +108,8 @@ simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) { r_.neon_i32 = t.val[0]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6); #else @@ -132,6 +138,8 @@ simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) { r_.neon_i32 = t.val[1]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7); #else @@ -160,6 +168,8 @@ simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) { r_.neon_f32 = t.val[0]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickev_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6); #else @@ -188,6 +198,8 @@ simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) { r_.neon_f32 = t.val[1]; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7); #else @@ -213,6 +225,8 @@ simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + 
r_.lsx_i64 = __lsx_vpickev_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -238,6 +252,8 @@ simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpickod_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); #else @@ -266,6 +282,10 @@ simde_mm_addsub_pd (simde__m128d a, simde__m128d b) { float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64); float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64); return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d temp_ra = __lsx_vfadd_d(a_.lsx_f64, b_.lsx_f64); + __m128d temp_rs = __lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64); + return (__m128d)__lsx_vextrins_d((__m128i)temp_ra, (__m128i)temp_rs, 0); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3); #else @@ -297,6 +317,11 @@ simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32); float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32); return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128 temp_ra = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); + __m128 temp_rs = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); + __m128i temp = __lsx_vextrins_w((__m128i)temp_ra, (__m128i)temp_rs, 0); + r_.lsx_i64 = __lsx_vextrins_w(temp, (__m128i)temp_rs, 0b00100010); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7); #else @@ -385,6 +410,8 @@ simde_mm_lddqu_si128 (simde__m128i const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -408,6 +435,8 @@ simde_mm_loaddup_pd (simde_float64 const* mem_addr) { r_.neon_f64 = vdupq_n_f64(*mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_d(mem_addr, 0); #else r_.f64[0] = *mem_addr; r_.f64[1] = *mem_addr; @@ -434,6 +463,8 @@ simde_mm_movedup_pd (simde__m128d a) { r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_d(a_.lsx_i64, 0); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); #else @@ -462,6 +493,8 @@ simde_mm_movehdup_ps (simde__m128 a) { r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 1, 3, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpackod_w(a_.lsx_i64, a_.lsx_i64); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = 
SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3); #else @@ -492,6 +525,8 @@ simde_mm_moveldup_ps (simde__m128 a) { r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vpackev_w(a_.lsx_i64, a_.lsx_i64); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2); #else From 8787efdb98227ea057e5e48f079c79e764ce3eb4 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 10 Dec 2024 20:10:37 +0800 Subject: [PATCH 3/5] x86 ssse3: add loongarch lsx optimized implementations --- simde/x86/ssse3.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/simde/x86/ssse3.h b/simde/x86/ssse3.h index 6c4c12d5f..db60c2fb5 100644 --- a/simde/x86/ssse3.h +++ b/simde/x86/ssse3.h @@ -51,6 +51,8 @@ simde_mm_abs_epi8 (simde__m128i a) { r_.altivec_i8 = vec_abs(a_.altivec_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_b(a_.lsx_i64, __lsx_vreplgr2vr_b(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -83,6 +85,8 @@ simde_mm_abs_epi16 (simde__m128i a) { r_.altivec_i16 = vec_abs(a_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_h(a_.lsx_i64, __lsx_vreplgr2vr_h(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -116,6 +120,8 @@ simde_mm_abs_epi32 (simde__m128i a) { r_.altivec_i32 = vec_abs(a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vabsd_w(a_.lsx_i64, __lsx_vreplgr2vr_w(0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -251,6 +257,24 @@ simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) ((count) > 15) \ ? 
(simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(a), vdupq_n_s8(0), (count) & 15))) \ : (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(b), simde__m128i_to_neon_i8(a), ((count) & 15)))))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_alignr_epi8(a, b, count) \ + ({ \ + __m128i res_; \ + if (count > 31) { \ + res_ = __lsx_vreplgr2vr_b(0); \ + } \ + else if (count > 15) { \ + res_ = __lsx_vbsrl_v(a, ((count)&15)); \ + } \ + else if (count == 0) { \ + res_ = b; \ + } \ + else { \ + res_ = __lsx_vor_v(__lsx_vbsll_v(a, (16-((count)&15))), __lsx_vbsrl_v(b, ((count)&15))); \ + } \ + (simde__m128i)res_; \ + }) #endif #if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) #define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count) @@ -337,6 +361,10 @@ simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_swizzle( a_.wasm_v128, wasm_v128_and(b_.wasm_v128, wasm_i8x16_splat(0x8F))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i b1_ = __lsx_vslti_b(b_.lsx_i64, 0); + r_.lsx_i64 = __lsx_vshuf_b(a_.lsx_i64, a_.lsx_i64, __lsx_vandi_b(b_.lsx_i64, 15)); + r_.lsx_i64 = __lsx_vand_v(r_.lsx_i64, __lsx_vnor_v(b1_, b1_)); #else for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7); @@ -689,6 +717,10 @@ simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) { /* saturated add */ r_.neon_i16 = vqaddq_s16(prod1, prod2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_h_bu_b(a_.lsx_i64, b_.lsx_i64); + __m128i temp_od = __lsx_vmulwod_h_bu_b(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vsadd_h(temp_ev, temp_od); #else for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { const int idx = HEDLEY_STATIC_CAST(int, i) << 1; @@ -775,6 +807,12 @@ simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { __lo = wasm_i32x4_add(__lo, __lo); __hi = wasm_i32x4_add(__hi, __hi); r_.wasm_v128 = wasm_i16x8_shuffle(__lo, __hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); + __m128i temp_od = __lsx_vmulwod_w_h(a_.lsx_i64, b_.lsx_i64); + __m128i temp1 = __lsx_vilvl_w(temp_od, temp_ev); + __m128i temp2 = __lsx_vilvh_w(temp_od, temp_ev); + r_.lsx_i64 = __lsx_vssrarni_h_w(temp2, temp1, 15); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -850,6 +888,8 @@ simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { simde__m128i mask = wasm_i8x16_shr(b_.wasm_v128, 7); simde__m128i zeromask = simde_mm_cmpeq_epi8(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi8(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_b(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -890,6 +930,8 @@ simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { simde__m128i mask = simde_mm_srai_epi16(b_.wasm_v128, 15); simde__m128i zeromask = simde_mm_cmpeq_epi16(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi16(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_h(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -930,6 +972,8 
@@ simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) { simde__m128i mask = simde_mm_srai_epi32(b_.wasm_v128, 31); simde__m128i zeromask = simde_mm_cmpeq_epi32(b_.wasm_v128, simde_mm_setzero_si128()); r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi32(a_.wasm_v128, mask), mask)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsigncov_w(b_.lsx_i64, a_.lsx_i64); #else for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] != 0) ? (a_.i32[i]) : INT32_C(0)); From b7fcc18285263a123821473e74d682300e46d5d0 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 11 Dec 2024 13:11:21 +0800 Subject: [PATCH 4/5] x86 sse4.1: add loongarch lsx optimized implementations --- simde/x86/sse4.1.h | 349 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 293 insertions(+), 56 deletions(-) diff --git a/simde/x86/sse4.1.h b/simde/x86/sse4.1.h index 15a197b95..ba2bf1869 100644 --- a/simde/x86/sse4.1.h +++ b/simde/x86/sse4.1.h @@ -47,10 +47,17 @@ simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8) a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi16((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1, + (imm8>>4)&1, (imm8>>5)&1, (imm8>>6)&1, (imm8>>7)&1); + mask = __lsx_vseqi_h(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = ((imm8 >> i) & 1) ? b_.u16[i] : a_.u16[i]; + } +#endif return simde__m128i_from_private(r_); } @@ -96,10 +103,16 @@ simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8) a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1); + mask = __lsx_vseqi_w(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; + } +#endif return simde__m128d_from_private(r_); } #if defined(SIMDE_X86_SSE4_1_NATIVE) @@ -138,10 +151,16 @@ simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8) a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; - } +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1); + mask = __lsx_vseqi_w(mask, 1); + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask); +#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = ((imm8 >> i) & 1) ? 
b_.f32[i] : a_.f32[i]; + } +#endif return simde__m128_from_private(r_); } #if defined(SIMDE_X86_SSE4_1_NATIVE) @@ -197,6 +216,8 @@ simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, m); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_b(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */ #if defined(HEDLEY_INTEL_VERSION_CHECK) @@ -241,6 +262,8 @@ simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_h(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -282,6 +305,8 @@ simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) { r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_w(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i32) z = { 0, 0, 0, 0 }; @@ -326,6 +351,8 @@ simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) { #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63))); r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, __lsx_vslti_d(mask_.lsx_i64, 0)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) #if defined(HEDLEY_INTEL_VERSION_CHECK) __typeof__(mask_.i64) z = { 0, 0 }; @@ -355,6 +382,10 @@ simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) v128_t m_ = wasm_i64x2_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 63); return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_m = __lsx_vfcmp_clt_d(simde__m128d_to_private(mask).lsx_f64, (__m128d)__lsx_vreplgr2vr_w(0)); + __m128i r = __lsx_vbitsel_v(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, temp_m); + return (simde__m128d)r; #else return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask))); #endif @@ -372,6 +403,10 @@ simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) v128_t m_ = wasm_i32x4_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 31); return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_m = __lsx_vfcmp_clt_s(simde__m128_to_private(mask).lsx_f32, (__m128)__lsx_vreplgr2vr_w(0)); + __m128i r = __lsx_vbitsel_v(simde__m128_to_private(a).lsx_i64, simde__m128_to_private(b).lsx_i64, temp_m); + return (simde__m128)r; #else return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask))); #endif @@ -403,6 +438,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.neon_f64 = vrndiq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); #elif defined(simde_math_nearbyint) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -420,6 +457,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.neon_f64 = vrndaq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); #elif defined(simde_math_roundeven) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -437,6 +476,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.neon_f64 = vrndmq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -452,6 +493,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.neon_f64 = vrndpq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); #elif defined(simde_math_ceil) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -469,6 +512,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.neon_f64 = vrndq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -528,7 +573,9 @@ simde_mm_ceil_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_ceilf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfrintrp_d(b_.lsx_f64), 0); + #elif defined(simde_math_ceilf) r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0]))); #else HEDLEY_UNREACHABLE(); @@ -557,7 +604,9 @@ simde_mm_ceil_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(simde_math_ceilf) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, (__m128i)__lsx_vfrintrp_s(b_.lsx_f32), 0); + #elif defined(simde_math_ceilf) r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0]))); #else HEDLEY_UNREACHABLE(); @@ 
-593,6 +642,8 @@ simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_d(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { @@ -626,6 +677,8 @@ simde_mm_cvtepi8_epi16 (simde__m128i a) { r_.neon_i16 = s16x8; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_extend_low_i8x16(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_h_b(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, 0, -1, 1, -1, 2, -1, 3, @@ -669,6 +722,9 @@ simde_mm_cvtepi8_epi32 (simde__m128i a) { r_.neon_i32 = s32x4; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); + + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_w_h(__lsx_vsllwil_h_b(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, -1, -1, 0, -1, -1, -1, 1, @@ -709,6 +765,8 @@ simde_mm_cvtepi8_epi64 (simde__m128i a) { v128_t extra = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); v128_t sign = wasm_i32x4_gt(wasm_i64x2_const(0, 0), extra); r_.wasm_v128 = wasm_i32x4_shuffle(extra, sign, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(__lsx_vsllwil_w_h(__lsx_vsllwil_h_b(a_.lsx_i64, 0), 0), 0); #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) /* Disabled on x86 due to lack of 64-bit arithmetic shift until * until AVX-512 (at which point we would be using the native @@ -750,6 +808,8 @@ simde_mm_cvtepu8_epi16 (simde__m128i a) { r_.neon_u16 = u16x8; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u16x8_extend_low_u8x16(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_hu_bu(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -799,6 +859,8 @@ simde_mm_cvtepu8_epi32 (simde__m128i a) { r_.neon_u32 = u32x4; #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_wu_hu(__lsx_vsllwil_hu_bu(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -845,6 +907,8 @@ simde_mm_cvtepu8_epi64 (simde__m128i a) { uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ r_.neon_u64 = 
u64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(__lsx_vsllwil_wu_hu(__lsx_vsllwil_hu_bu(a_.lsx_i64, 0), 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -881,6 +945,8 @@ simde_mm_cvtepi16_epi32 (simde__m128i a) { r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_w_h(a_.lsx_i64, 0); #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3)); r_.i32 >>= 16; @@ -915,6 +981,8 @@ simde_mm_cvtepu16_epi32 (simde__m128i a) { r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_wu_hu(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u16) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, @@ -954,6 +1022,8 @@ simde_mm_cvtepu16_epi64 (simde__m128i a) { uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ r_.neon_u64 = u64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(__lsx_vsllwil_wu_hu(a_.lsx_i64, 0), 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u16) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, @@ -989,6 +1059,8 @@ simde_mm_cvtepi16_epi64 (simde__m128i a) { int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ r_.neon_i64 = s64x2; + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(__lsx_vsllwil_w_h(a_.lsx_i64, 0), 0); #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 9, 10, 0, @@ -1026,6 +1098,8 @@ simde_mm_cvtepi32_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_d_w(a_.lsx_i64, 0); #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1)); r_.i64 >>= 32; @@ -1060,6 +1134,8 @@ simde_mm_cvtepu32_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsllwil_du_wu(a_.lsx_i64, 0); #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == 
SIMDE_ENDIAN_LITTLE) __typeof__(r_.u32) z = { 0, }; r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6)); @@ -1119,6 +1195,36 @@ simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8) } break; } + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d tmp = __lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64); + + switch (imm8) { + case 0xff: + r_.lsx_f64 = __lsx_vfadd_d(tmp, (__m128d)__lsx_vshuf4i_d((__m128i)tmp, (__m128i)tmp, 0b0001)); + break; + case 0x13: + r_.lsx_i64 = __lsx_vilvl_d((__m128i)tmp, (__m128i)tmp); + break; + default: + { + uint64_t mask_data[] = { + (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0), + (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0), + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)tmp); + } + + r_.lsx_f64 = __lsx_vfadd_d(r_.lsx_f64, (__m128d)__lsx_vshuf4i_d((__m128i)r_.lsx_f64, (__m128i)r_.lsx_f64, 0b0001)); + + { + uint64_t mask_data[] = { + (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0), + (imm8 & 2) ? ~UINT64_C(0) : UINT64_C(0) + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), r_.lsx_i64); + } + break; + } #else simde_float64 sum = SIMDE_FLOAT64_C(0.0); @@ -1189,6 +1295,52 @@ simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8) } break; } + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + __m128 tmp = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); + + switch (imm8) { + case 0xff: + { + __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp, (__m128i)tmp); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + break; + } + case 0x7f: + { + __m128i tmp0 = __lsx_vinsgr2vr_w(tmp, 0, 3); + __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp0, (__m128i)tmp0); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + break; + } + default: + { + { + uint32_t mask_data[] = { + (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0) + }; + r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)tmp); + } + + __m128i tmp1 = __lsx_vilvh_d(r_.lsx_i64, r_.lsx_i64); + __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, r_.lsx_f32); + r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1)); + { + uint32_t mask_data[] = { + (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0), + (imm8 & 8) ? 
@@ -1189,6 +1295,52 @@ simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8)
           }
           break;
       }
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS)
+      __m128 tmp = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32);
+
+      switch (imm8) {
+        case 0xff:
+          {
+            __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp, (__m128i)tmp);
+            __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp);
+            r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1));
+            break;
+          }
+        case 0x7f:
+          {
+            __m128i tmp0 = __lsx_vinsgr2vr_w(tmp, 0, 3);
+            __m128i tmp1 = __lsx_vilvh_d((__m128i)tmp0, (__m128i)tmp0);
+            __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, (__m128)tmp);
+            r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1));
+            break;
+          }
+        default:
+          {
+            {
+              uint32_t mask_data[] = {
+                (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0)
+              };
+              r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)tmp);
+            }
+
+            __m128i tmp1 = __lsx_vilvh_d(r_.lsx_i64, r_.lsx_i64);
+            __m128 tmp2 = __lsx_vfadd_s((__m128)tmp1, r_.lsx_f32);
+            r_.lsx_f32 = __lsx_vfadd_s((__m128)__lsx_vreplvei_w(tmp2, 0), (__m128)__lsx_vreplvei_w(tmp2, 1));
+            {
+              uint32_t mask_data[] = {
+                (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0),
+                (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0)
+              };
+              r_.lsx_i64 = __lsx_vand_v(__lsx_vld(mask_data, 0), (__m128i)r_.lsx_f32);
+            }
+          }
+          break;
+      }
     #else
       simde_float32 sum = SIMDE_FLOAT32_C(0.0);
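
The masked multiply / horizontal-add sequence above follows the architectural definition of DPPS: the high nibble of imm8 selects which products enter the sum, and the low nibble selects which result lanes receive it. A scalar model of those semantics, for reference only (this is a sketch, not SIMDe code):

    /* Scalar model of _mm_dp_ps: sum the selected products, then write the
     * sum into the selected output lanes and zero the rest. */
    static void dp_ps_scalar(const float a[4], const float b[4], int imm8, float r[4]) {
      float sum = 0.0f;
      for (int i = 0; i < 4; i++) {
        if (imm8 & (1 << (i + 4)))      /* high nibble: include a[i]*b[i] */
          sum += a[i] * b[i];
      }
      for (int i = 0; i < 4; i++)
        r[i] = (imm8 & (1 << i)) ? sum : 0.0f;   /* low nibble: keep or zero */
    }
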
@@ -1247,6 +1399,8 @@ simde_mm_extract_epi8 (simde__m128i a, const int imm8)
 # define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_neon_i8(a), imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
 # define simde_mm_extract_epi8(a, imm8) wasm_u8x16_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15)
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_extract_epi8(a, imm8) __lsx_vpickve2gr_b(simde__m128i_to_lsx_i8(a), imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
   #undef _mm_extract_epi8
@@ -1281,6 +1435,8 @@ simde_mm_extract_epi32 (simde__m128i a, const int imm8)
 # define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_altivec_i32(a), imm8))
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
 # define simde_mm_extract_epi32(a, imm8) wasm_i32x4_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3)
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_extract_epi32(a, imm8) __lsx_vpickve2gr_w(simde__m128i_to_lsx_i32(a), imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
   #undef _mm_extract_epi32
@@ -1313,6 +1469,8 @@ simde_mm_extract_epi64 (simde__m128i a, const int imm8)
 # define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_neon_i64(a), imm8)
 #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
 # define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_altivec_i64(a), imm8))
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_extract_epi64(a, imm8) __lsx_vpickve2gr_d(simde__m128i_to_lsx_i64(a), imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
   #undef _mm_extract_epi64
@@ -1337,6 +1495,8 @@ simde_mm_extract_ps (simde__m128 a, const int imm8)
   #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_neon_i32(a), imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
   #define simde_mm_extract_ps(a, imm8) wasm_i32x4_extract_lane(simde__m128_to_wasm_v128((a)), (imm8) & 3)
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+  #define simde_mm_extract_ps(a, imm8) __lsx_vpickve2gr_w(simde__m128_to_lsx_i32(a), imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
   #undef _mm_extract_ps
@@ -1380,7 +1540,9 @@ simde_mm_floor_sd (simde__m128d a, simde__m128d b) {
       a_ = simde__m128d_to_private(a),
       b_ = simde__m128d_to_private(b);
 
-    #if defined(simde_math_floor)
+    #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfrintrm_d(b_.lsx_f64), 0);
+    #elif defined(simde_math_floor)
       r_.f64[0] = simde_math_floor(b_.f64[0]);
       r_.f64[1] = a_.f64[1];
     #else
@@ -1410,7 +1572,9 @@ simde_mm_floor_ss (simde__m128 a, simde__m128 b) {
       a_ = simde__m128_to_private(a),
       b_ = simde__m128_to_private(b);
 
-    #if defined(simde_math_floorf)
+    #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i32, (__m128i)__lsx_vfrintrm_s(b_.lsx_f32), 0);
+    #elif defined(simde_math_floorf)
       r_.f32[0] = simde_math_floorf(b_.f32[0]);
       for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
         r_.f32[i] = a_.f32[i];
@@ -1451,6 +1615,8 @@ simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8)
 # define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_neon_i8(a), imm8))
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
 # define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i8x16_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15, HEDLEY_STATIC_CAST(int8_t, (i))))
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_insert_epi8(a, i, imm8) __lsx_vinsgr2vr_b(a, i, imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
   #undef _mm_insert_epi8
@@ -1478,6 +1644,8 @@ simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8)
 # define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_neon_i32(a), imm8))
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
 # define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i32x4_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3, (i)))
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_insert_epi32(a, i, imm8) __lsx_vinsgr2vr_w(a, i, imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
   #undef _mm_insert_epi32
@@ -1517,6 +1685,8 @@ simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8)
 # define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_neon_i64(a), imm8))
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
 # define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i64x2_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 1, (i)))
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+# define simde_mm_insert_epi64(a, i, imm8) __lsx_vinsgr2vr_d(a, i, imm8)
 #endif
 #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64))
   #undef _mm_insert_epi64
@@ -1535,11 +1705,16 @@ simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8)
     float tmp1_ = b_.f32[(imm8 >> 6) & 3];
     a_.f32[(imm8 >> 4) & 3] = tmp1_;
 
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
-      r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
-    }
-
+    #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      simde__m128i mask = simde_mm_setr_epi32((imm8>>0)&1, (imm8>>1)&1, (imm8>>2)&1, (imm8>>3)&1);
+      mask = __lsx_vseqi_w(mask, 1);
+      r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, __lsx_vreplgr2vr_w(0), mask);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
+        r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i];
+      }
+    #endif
     return simde__m128_from_private(r_);
 }
 #if defined(SIMDE_X86_SSE4_1_NATIVE)
@@ -1570,6 +1745,8 @@ simde_mm_max_epi8 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmax_b(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
@@ -1605,6 +1782,8 @@ simde_mm_max_epi32 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmax_w(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
@@ -1640,6 +1819,8 @@ simde_mm_max_epu16 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmax_hu(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
@@ -1672,6 +1853,8 @@ simde_mm_max_epu32 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmax_wu(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
@@ -1704,6 +1887,8 @@ simde_mm_min_epi8 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmin_b(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
@@ -1736,6 +1921,8 @@ simde_mm_min_epi32 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmin_w(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
@@ -1771,6 +1958,8 @@ simde_mm_min_epu16 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmin_hu(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
@@ -1803,6 +1992,8 @@ simde_mm_min_epu32 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128);
     #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
       r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmin_wu(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
@@ -1898,6 +2089,8 @@ simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) {
       r_.wasm_v128 = wasm_i64x2_make(
         wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)),
         wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2)));
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmulwev_d_w(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
@@ -1934,6 +2127,8 @@ simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) {
       r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32);
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmul_w(a_.lsx_i64, b_.lsx_i64);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
@@ -1959,6 +2154,8 @@ simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) {
 
     #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
       r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vmul_w(a_.lsx_i64, b_.lsx_i64);
     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
       r_.u32 = a_.u32 * b_.u32;
     #else
@@ -2007,6 +2204,8 @@ simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) {
       r_.altivec_u16 = vec_packsu(a_.altivec_i32, b_.altivec_i32);
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.wasm_v128 = wasm_u16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vssrarni_hu_w(b_.lsx_i64, a_.lsx_i64, 0);
     #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
       int32_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
         0, 1, 2, 3, 4, 5, 6, 7);
@@ -2039,30 +2238,38 @@ simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding)
     b_ = simde__m128d_to_private(b);
 
   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
-    #if defined(simde_math_nearbyint)
-      case SIMDE_MM_FROUND_TO_NEAREST_INT:
-      case SIMDE_MM_FROUND_CUR_DIRECTION:
+    case SIMDE_MM_FROUND_TO_NEAREST_INT:
+    case SIMDE_MM_FROUND_CUR_DIRECTION:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrne_d(b_.lsx_f64), 0);
+      #elif defined(simde_math_nearbyint)
         r_.f64[0] = simde_math_nearbyint(b_.f64[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_floor)
-      case SIMDE_MM_FROUND_TO_NEG_INF:
+    case SIMDE_MM_FROUND_TO_NEG_INF:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrm_d(b_.lsx_f64), 0);
+      #elif defined(simde_math_floor)
        r_.f64[0] = simde_math_floor(b_.f64[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_ceil)
-      case SIMDE_MM_FROUND_TO_POS_INF:
+    case SIMDE_MM_FROUND_TO_POS_INF:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrp_d(b_.lsx_f64), 0);
+      #elif defined(simde_math_ceil)
        r_.f64[0] = simde_math_ceil(b_.f64[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_trunc)
-      case SIMDE_MM_FROUND_TO_ZERO:
+    case SIMDE_MM_FROUND_TO_ZERO:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_d(r_.lsx_i64, (__m128i)__lsx_vfrintrz_d(b_.lsx_f64), 0);
+      #elif defined(simde_math_trunc)
        r_.f64[0] = simde_math_trunc(b_.f64[0]);
-        break;
-    #endif
+      #endif
+      break;
 
     default:
       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
@@ -2091,30 +2298,38 @@ simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding)
     b_ = simde__m128_to_private(b);
 
   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
-    #if defined(simde_math_nearbyintf)
-      case SIMDE_MM_FROUND_TO_NEAREST_INT:
-      case SIMDE_MM_FROUND_CUR_DIRECTION:
+    case SIMDE_MM_FROUND_TO_NEAREST_INT:
+    case SIMDE_MM_FROUND_CUR_DIRECTION:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrne_s(b_.lsx_f32), 0);
+      #elif defined(simde_math_nearbyintf)
        r_.f32[0] = simde_math_nearbyintf(b_.f32[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_floorf)
-      case SIMDE_MM_FROUND_TO_NEG_INF:
+    case SIMDE_MM_FROUND_TO_NEG_INF:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrm_s(b_.lsx_f32), 0);
+      #elif defined(simde_math_floorf)
        r_.f32[0] = simde_math_floorf(b_.f32[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_ceilf)
-      case SIMDE_MM_FROUND_TO_POS_INF:
+    case SIMDE_MM_FROUND_TO_POS_INF:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrp_s(b_.lsx_f32), 0);
+      #elif defined(simde_math_ceilf)
        r_.f32[0] = simde_math_ceilf(b_.f32[0]);
-        break;
-    #endif
+      #endif
+      break;
 
-    #if defined(simde_math_truncf)
-      case SIMDE_MM_FROUND_TO_ZERO:
+    case SIMDE_MM_FROUND_TO_ZERO:
+      #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+        r_.lsx_i64 = __lsx_vextrins_w(r_.lsx_i64, (__m128i)__lsx_vfrintrz_s(b_.lsx_f32), 0);
+      #elif defined(simde_math_truncf)
        r_.f32[0] = simde_math_truncf(b_.f32[0]);
-        break;
-    #endif
+      #endif
+      break;
 
     default:
       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
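
Each SIMDE_MM_FROUND_* case above now maps to one LSX vfrintr* instruction, mirroring the C99 functions used by the portable fallback. A scalar view of that dispatch for the lowest lane (a sketch for reference; the constants follow the usual SSE4.1 encoding, and the real code treats unknown modes as unreachable):

    #include <math.h>

    static double round_lane0(double b, int rounding) {
      switch (rounding & ~0x8) {      /* drop _MM_FROUND_NO_EXC */
        case 0x0:                     /* TO_NEAREST_INT -> vfrintrne */
        case 0x4:                     /* CUR_DIRECTION, handled the same way here */
          return nearbyint(b);
        case 0x1: return floor(b);    /* TO_NEG_INF -> vfrintrm */
        case 0x2: return ceil(b);     /* TO_POS_INF -> vfrintrp */
        case 0x3: return trunc(b);    /* TO_ZERO    -> vfrintrz */
        default:  return b;           /* unreachable in the real code */
      }
    }
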
@@ -2142,7 +2357,7 @@ simde_mm_stream_load_si128 (const simde__m128i* mem_addr) {
   #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \
       defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \
       defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \
-      defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE))
+      defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE))
     return __builtin_nontemporal_load(mem_addr);
   #else
     return simde_mm_load_si128(mem_addr);
@@ -2168,6 +2383,8 @@ simde_mm_test_all_ones (simde__m128i a) {
       r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r = HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(a_.wasm_v128, 0) & wasm_i64x2_extract_lane(a_.wasm_v128, 1)) == 0xFFFFFFFFFFFFFFFFull;
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r = ((__lsx_vpickve2gr_d(a_.lsx_i64, 0) & __lsx_vpickve2gr_d(a_.lsx_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0));
     #else
       int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);
 
@@ -2202,6 +2419,8 @@ simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) {
       r = !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1));
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r = (wasm_i64x2_extract_lane(tmp_.wasm_v128, 0) | wasm_i64x2_extract_lane(tmp_.wasm_v128, 1)) == 0;
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r = !(__lsx_vpickve2gr_d(tmp_.lsx_i64, 0) | __lsx_vpickve2gr_d(tmp_.lsx_i64, 1));
     #else
       int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0);
 
@@ -2242,6 +2461,13 @@ simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) {
       long long ones = c0 | c1;
       long long zeros = ~(c0 & c1);
       return ones && zeros;
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      __m128i tmp = __lsx_vand_v(a_.lsx_i64, mask_.lsx_i64);
+      long long tmp0 = __lsx_vpickve2gr_d(tmp, 0);
+      long long tmp1 = __lsx_vpickve2gr_d(tmp, 1);
+      long long ones = tmp0 | tmp1;
+      long long zeros = ~(tmp0 & tmp1);
+      return ones && zeros;
     #else
       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++)
         if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0))
@@ -2272,6 +2498,9 @@ simde_mm_testc_si128 (simde__m128i a, simde__m128i b) {
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       v128_t m = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
       return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0;
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      __m128i tmp = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64);
+      return (__lsx_vpickve2gr_d(tmp, 0) | __lsx_vpickve2gr_d(tmp, 1)) == 0;
     #else
       int_fast32_t r = 0;
 
@@ -2309,6 +2538,11 @@ simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) {
       v128_t m2 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
       return (wasm_i64x2_extract_lane(m1, 0) | wasm_i64x2_extract_lane(m1, 1)) \
         && (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1));
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      __m128i m1 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64);
+      __m128i m2 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64);
+      return (__lsx_vpickve2gr_d(m1, 0) | __lsx_vpickve2gr_d(m1, 1)) \
+        && (__lsx_vpickve2gr_d(m2, 0) | __lsx_vpickve2gr_d(m2, 1));
     #else
       for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
         if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0))
@@ -2340,6 +2574,9 @@ simde_mm_testz_si128 (simde__m128i a, simde__m128i b) {
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       v128_t m = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
       return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0;
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      __m128i tmp = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64);
+      return !(__lsx_vpickve2gr_d(tmp, 0) | __lsx_vpickve2gr_d(tmp, 1));
     #elif defined(SIMDE_HAVE_INT128_)
       if ((a_.u128[0] & b_.u128[0]) == 0) {
         return 1;
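
All of the test helpers above reduce a 128-bit condition to a scalar by OR-ing the two 64-bit halves extracted with __lsx_vpickve2gr_d; the flags being emulated are those of PTEST. A scalar restatement of the three predicates (sketch only):

    #include <stdint.h>

    /* ZF is set when (a & b) is all zeros, CF when (~a & b) is all zeros. */
    static int testz_u64x2(const uint64_t a[2], const uint64_t b[2]) {
      return ((a[0] & b[0]) | (a[1] & b[1])) == 0;        /* _mm_testz_si128 */
    }
    static int testc_u64x2(const uint64_t a[2], const uint64_t b[2]) {
      return ((~a[0] & b[0]) | (~a[1] & b[1])) == 0;      /* _mm_testc_si128 */
    }
    static int testnzc_u64x2(const uint64_t a[2], const uint64_t b[2]) {
      return !testz_u64x2(a, b) && !testc_u64x2(a, b);    /* _mm_testnzc_si128 */
    }
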
From 656c2a27aa2da59fc55e5c9700277beb7b5da971 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Wed, 11 Dec 2024 13:15:24 +0800
Subject: [PATCH 5/5] x86 sse4.2: add loongarch lsx optimized implementations

---
 simde/x86/sse4.2.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/simde/x86/sse4.2.h b/simde/x86/sse4.2.h
index c57e28e6a..a0723952c 100644
--- a/simde/x86/sse4.2.h
+++ b/simde/x86/sse4.2.h
@@ -175,6 +175,8 @@ simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) {
       r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64));
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.wasm_v128 = wasm_i64x2_gt(a_.wasm_v128, b_.wasm_v128);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      r_.lsx_i64 = __lsx_vslt_d(b_.lsx_i64, a_.lsx_i64);
     #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
       r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
     #else
@@ -298,6 +300,8 @@ simde_mm_crc32_u8(uint32_t prevcrc, uint8_t v) {
   #else
     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32)
       return __crc32cb(prevcrc, v);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      return __builtin_loongarch_crcc_w_b_w(v, prevcrc);
     #else
       uint32_t crc = prevcrc;
       crc ^= v;
@@ -331,6 +335,8 @@ simde_mm_crc32_u16(uint32_t prevcrc, uint16_t v) {
   #else
     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32)
       return __crc32ch(prevcrc, v);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      return __builtin_loongarch_crcc_w_h_w(v, prevcrc);
     #else
       uint32_t crc = prevcrc;
       crc = simde_mm_crc32_u8(crc, v & 0xff);
@@ -351,6 +357,8 @@ simde_mm_crc32_u32(uint32_t prevcrc, uint32_t v) {
   #else
     #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRC32)
       return __crc32cw(prevcrc, v);
+    #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+      return __builtin_loongarch_crcc_w_w_w(v, prevcrc);
     #else
       uint32_t crc = prevcrc;
       crc = simde_mm_crc32_u16(crc, v & 0xffff);
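
The crcc.w.*.w builtins used above perform the same reflected CRC-32C (Castagnoli) update as the scalar fallback kept in the #else branches; wider inputs are simply chained byte updates. A bit-serial sketch of that update, for reference (0x82F63B78 is the reflected CRC-32C polynomial):

    #include <stdint.h>

    static uint32_t crc32c_u8(uint32_t crc, uint8_t v) {
      crc ^= v;
      for (int bit = 0; bit < 8; bit++)
        crc = (crc >> 1) ^ ((crc & 1) ? UINT32_C(0x82F63B78) : 0);
      return crc;
    }

    /* Mirrors the portable path of simde_mm_crc32_u16: low byte first. */
    static uint32_t crc32c_u16(uint32_t crc, uint16_t v) {
      crc = crc32c_u8(crc, (uint8_t)(v & 0xff));
      return crc32c_u8(crc, (uint8_t)(v >> 8));
    }
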