diff --git a/simde/x86/avx.h b/simde/x86/avx.h
index bd74f8677..e13b4cc20 100644
--- a/simde/x86/avx.h
+++ b/simde/x86/avx.h
@@ -1697,22 +1697,16 @@ simde__m256
 simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     return _mm256_addsub_ps(a, b);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    __m256 add_ = __lasx_xvfadd_s(a, b), sub_ = __lasx_xvfsub_s(a, b);
+    return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(sub_, add_, 0x11), add_, 0x33);
   #else
     simde__m256_private
       r_,
       a_ = simde__m256_to_private(a),
       b_ = simde__m256_to_private(b);
 
-    #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
-      simde__m256_private aev_, aod_, bev_, bod_;
-      aev_.i256 = __lasx_xvpickev_w(a_.i256, a_.i256);
-      aod_.i256 = __lasx_xvpickod_w(a_.i256, a_.i256);
-      bev_.i256 = __lasx_xvpickev_w(b_.i256, b_.i256);
-      bod_.i256 = __lasx_xvpickod_w(b_.i256, b_.i256);
-      aev_.f256 = __lasx_xvfsub_s(aev_.f256, bev_.f256);
-      aod_.f256 = __lasx_xvfadd_s(aod_.f256, bod_.f256);
-      r_.i256 = __lasx_xvilvl_w(aod_.i256, aev_.i256);
-    #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
+    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
       r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]);
       r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]);
     #else
@@ -1736,22 +1730,16 @@ simde__m256d
 simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) {
   #if defined(SIMDE_X86_AVX_NATIVE)
     return _mm256_addsub_pd(a, b);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    __m256d add_ = __lasx_xvfadd_d(a, b), sub_ = __lasx_xvfsub_d(a, b);
+    return (simde__m256d)__lasx_xvextrins_d(__lasx_xvextrins_d(sub_, add_, 0x11), add_, 0x33);
   #else
     simde__m256d_private
       r_,
       a_ = simde__m256d_to_private(a),
       b_ = simde__m256d_to_private(b);
 
-    #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
-      simde__m256d_private aev_, aod_, bev_, bod_;
-      aev_.i256 = __lasx_xvpickev_d(a_.i256, a_.i256);
-      aod_.i256 = __lasx_xvpickod_d(a_.i256, a_.i256);
-      bev_.i256 = __lasx_xvpickev_d(b_.i256, b_.i256);
-      bod_.i256 = __lasx_xvpickod_d(b_.i256, b_.i256);
-      aev_.d256 = __lasx_xvfsub_d(aev_.d256, bev_.d256);
-      aod_.d256 = __lasx_xvfadd_d(aod_.d256, bod_.d256);
-      r_.i256 = __lasx_xvilvl_d(aod_.i256, aev_.i256);
-    #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
+    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
       r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]);
       r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]);
     #else
diff --git a/simde/x86/f16c.h b/simde/x86/f16c.h
index 9522bf6f6..27828a44c 100644
--- a/simde/x86/f16c.h
+++ b/simde/x86/f16c.h
@@ -51,6 +51,8 @@ simde_mm_cvtps_ph(simde__m128 a, const int imm8) {
 
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0)));
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vfcvt_h_s((v4f32)__lsx_vreplgr2vr_w(0), a_.lsx_f32);
   #elif defined(SIMDE_FLOAT16_VECTOR)
     SIMDE_VECTORIZE
     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
@@ -81,7 +83,9 @@ simde_mm_cvtph_ps(simde__m128i a) {
   simde__m128i_private a_ = simde__m128i_to_private(a);
   simde__m128_private r_;
 
-  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_f32 = __lsx_vfcvtl_s_h(a_.lsx_i64);
+  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16));
   #elif defined(SIMDE_FLOAT16_VECTOR)
     SIMDE_VECTORIZE
@@ -110,7 +114,11 @@ simde_mm256_cvtps_ph(simde__m256 a, const int imm8) {
 
   HEDLEY_STATIC_CAST(void, imm8);
 
-  #if defined(SIMDE_FLOAT16_VECTOR)
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    a_.i256 = __lasx_xvfcvt_h_s(a_.f256, a_.f256);
+    a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8);
+    r_.lsx_i64 = simde_mm256_extractf128_si256(a_.i256, 0);
+  #elif defined(SIMDE_FLOAT16_VECTOR)
     SIMDE_VECTORIZE
     for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
       r_.f16[i] = simde_float16_from_float32(a_.f32[i]);
@@ -146,7 +154,11 @@ simde_mm256_cvtph_ps(simde__m128i a) {
   simde__m128i_private a_ = simde__m128i_to_private(a);
   simde__m256_private r_;
 
-  #if defined(SIMDE_FLOAT16_VECTOR)
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    r_.i256 = simde_mm256_castsi128_si256(a_.lsx_i64);
+    r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8);
+    r_.f256 = __lasx_xvfcvtl_s_h(r_.i256);
+  #elif defined(SIMDE_FLOAT16_VECTOR)
     SIMDE_VECTORIZE
     for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
       r_.f32[i] = simde_float16_to_float32(a_.f16[i]);
diff --git a/simde/x86/fma.h b/simde/x86/fma.h
index 1a1104f65..bb174284b 100644
--- a/simde/x86/fma.h
+++ b/simde/x86/fma.h
@@ -42,6 +42,8 @@ simde__m128d
 simde_mm_fmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfmadd_d(a, b, c);
   #else
     simde__m128d_private
       a_ = simde__m128d_to_private(a),
@@ -78,6 +80,8 @@ simde__m256d
 simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfmadd_d(a, b, c);
   #else
     return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c);
   #endif
@@ -92,6 +96,8 @@ simde__m128
 simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfmadd_s(a, b, c);
   #else
     simde__m128_private
       a_ = simde__m128_to_private(a),
@@ -130,6 +136,8 @@ simde__m256
 simde_mm256_fmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfmadd_s(a, b, c);
   #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
     simde__m256_private
       a_ = simde__m256_to_private(a),
@@ -156,6 +164,8 @@ simde__m128d
 simde_mm_fmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT)
     return _mm_fmadd_sd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return (simde__m128d)__lsx_vextrins_d(a, __lsx_vfmadd_d(a, b, c), 0x00);
   #else
     return simde_mm_add_sd(simde_mm_mul_sd(a, b), c);
   #endif
@@ -170,6 +180,8 @@ simde__m128
 simde_mm_fmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT)
     return _mm_fmadd_ss(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return (simde__m128)__lsx_vextrins_w(a, __lsx_vfmadd_s(a, b, c), 0x00);
   #else
     return simde_mm_add_ss(simde_mm_mul_ss(a, b), c);
   #endif
@@ -240,6 +252,8 @@ simde__m128d
 simde_mm_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmsub_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfmsub_d(a, b, c);
   #else
     return simde_mm_sub_pd(simde_mm_mul_pd(a, b), c);
   #endif
@@ -254,6 +268,8 @@ simde__m256d
 simde_mm256_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmsub_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfmsub_d(a, b, c);
   #else
     return simde_mm256_sub_pd(simde_mm256_mul_pd(a, b), c);
   #endif
@@ -268,6 +284,8 @@ simde__m128
 simde_mm_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmsub_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfmsub_s(a, b, c);
   #else
     return simde_mm_sub_ps(simde_mm_mul_ps(a, b), c);
   #endif
@@ -282,6 +300,8 @@ simde__m256
 simde_mm256_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmsub_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfmsub_s(a, b, c);
   #else
     return simde_mm256_sub_ps(simde_mm256_mul_ps(a, b), c);
   #endif
@@ -324,6 +344,11 @@ simde__m128d
 simde_mm_fmsubadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmsubadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    a = __lsx_vfmul_d(a, b);
+    b = __lsx_vfsub_d(a, c);
+    c = __lsx_vfadd_d(a, c);
+    return (simde__m128d)__lsx_vextrins_d(c, b, 0x11);
   #else
     simde__m128d_private
       r_,
@@ -350,6 +375,11 @@ simde__m256d
 simde_mm256_fmsubadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmsubadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    a = __lasx_xvfmul_d(a, b);
+    b = __lasx_xvfsub_d(a, c);
+    c = __lasx_xvfadd_d(a, c);
+    return (simde__m256d)__lasx_xvextrins_d(c, b, 0x11);
   #else
     simde__m256d_private
       r_,
@@ -376,6 +406,11 @@ simde__m128
 simde_mm_fmsubadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fmsubadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    a = __lsx_vfmul_s(a, b);
+    b = __lsx_vfsub_s(a, c);
+    c = __lsx_vfadd_s(a, c);
+    return (simde__m128)__lsx_vextrins_w(__lsx_vextrins_w(c, b, 0x11), b, 0x33);
   #else
     simde__m128_private
       r_,
@@ -402,6 +437,11 @@ simde__m256
 simde_mm256_fmsubadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fmsubadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    a = __lasx_xvfmul_s(a, b);
+    b = __lasx_xvfsub_s(a, c);
+    c = __lasx_xvfadd_s(a, c);
+    return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(c, b, 0x11), b, 0x33);
   #else
     simde__m256_private
       r_,
@@ -428,6 +468,8 @@ simde__m128d
 simde_mm_fnmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fnmadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfsub_d(c, __lsx_vfmul_d(a, b));
   #else
     simde__m128d_private
       r_,
@@ -457,6 +499,8 @@ simde__m256d
 simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fnmadd_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfsub_d(c, __lasx_xvfmul_d(a, b));
   #else
     simde__m256d_private
       r_,
@@ -487,6 +531,8 @@ simde__m128
 simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fnmadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfsub_s(c, __lsx_vfmul_s(a, b));
   #else
     simde__m128_private
       r_,
@@ -518,6 +564,8 @@ simde__m256
 simde_mm256_fnmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fnmadd_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfsub_s(c, __lasx_xvfmul_s(a, b));
   #else
     simde__m256_private
       r_,
@@ -589,6 +637,8 @@ simde__m128d
 simde_mm_fnmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fnmsub_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfsub_d((__m128d)__lsx_vreplgr2vr_d(0), __lsx_vfmadd_d(a, b, c));
   #else
     simde__m128d_private
       r_,
@@ -614,6 +664,8 @@ simde__m256d
 simde_mm256_fnmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fnmsub_pd(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfsub_d((__m256d)__lasx_xvreplgr2vr_d(0), __lasx_xvfmadd_d(a, b, c));
   #else
     simde__m256d_private
       r_,
@@ -639,6 +691,8 @@ simde__m128
 simde_mm_fnmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm_fnmsub_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    return __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(0), __lsx_vfmadd_s(a, b, c));
   #else
     simde__m128_private
       r_,
@@ -664,6 +718,8 @@ simde__m256
 simde_mm256_fnmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
   #if defined(SIMDE_X86_FMA_NATIVE)
     return _mm256_fnmsub_ps(a, b, c);
+  #elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvfsub_s((__m256)__lasx_xvreplgr2vr_w(0), __lasx_xvfmadd_s(a, b, c));
   #else
     simde__m256_private
       r_,
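
Note on the alternating-lane sequences above (explanatory, not part of the patch): the LSX/LASX paths compute both the full add and the full subtract (or fused multiply add/subtract) vectors, then use [x]vextrins to merge the odd lanes from one result into the other, which is the lane pattern addsub/fmsubadd require. Below is a minimal scalar sketch of the semantics being matched; the helper names and test values are illustrative only.

/* Scalar reference for the alternating add/subtract lane pattern.
 * addsub: even lanes a - b, odd lanes a + b.
 * fmsubadd: even lanes (a * b) + c, odd lanes (a * b) - c. */
#include <stdio.h>

static void addsub_ps_ref(const float a[8], const float b[8], float r[8]) {
  for (int i = 0; i < 8; i++)
    r[i] = (i & 1) ? (a[i] + b[i]) : (a[i] - b[i]);
}

static void fmsubadd_pd_ref(const double a[2], const double b[2],
                            const double c[2], double r[2]) {
  for (int i = 0; i < 2; i++)
    r[i] = (i & 1) ? (a[i] * b[i] - c[i]) : (a[i] * b[i] + c[i]);
}

int main(void) {
  float  af[8] = {1, 2, 3, 4, 5, 6, 7, 8}, bf[8] = {8, 7, 6, 5, 4, 3, 2, 1}, rf[8];
  double ad[2] = {1.5, 2.5}, bd[2] = {2.0, 4.0}, cd[2] = {1.0, 1.0}, rd[2];

  addsub_ps_ref(af, bf, rf);
  fmsubadd_pd_ref(ad, bd, cd, rd);

  for (int i = 0; i < 8; i++) printf("%g ", rf[i]);  /* -7 9 -3 9 1 9 5 9 */
  printf("\n%g %g\n", rd[0], rd[1]);                 /* 4 9 */
  return 0;
}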