Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x86 f16c fma: add lasx optimized implementations #1248

Merged
merged 3 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 8 additions & 20 deletions simde/x86/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -1697,22 +1697,16 @@ simde__m256
simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_ps(a, b);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
__m256 add_ = __lasx_xvfadd_s(a, b), sub_ = __lasx_xvfsub_s(a, b);
return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(sub_, add_, 0x11), add_, 0x33);
#else
simde__m256_private
r_,
a_ = simde__m256_to_private(a),
b_ = simde__m256_to_private(b);

#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
simde__m256_private aev_, aod_, bev_, bod_;
aev_.i256 = __lasx_xvpickev_w(a_.i256, a_.i256);
aod_.i256 = __lasx_xvpickod_w(a_.i256, a_.i256);
bev_.i256 = __lasx_xvpickev_w(b_.i256, b_.i256);
bod_.i256 = __lasx_xvpickod_w(b_.i256, b_.i256);
aev_.f256 = __lasx_xvfsub_s(aev_.f256, bev_.f256);
aod_.f256 = __lasx_xvfadd_s(aod_.f256, bod_.f256);
r_.i256 = __lasx_xvilvl_w(aod_.i256, aev_.i256);
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]);
r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]);
#else
Expand All @@ -1736,22 +1730,16 @@ simde__m256d
simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_addsub_pd(a, b);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
__m256d add_ = __lasx_xvfadd_d(a, b), sub_ = __lasx_xvfsub_d(a, b);
return (simde__m256d)__lasx_xvextrins_d(__lasx_xvextrins_d(sub_, add_, 0x11), add_, 0x33);
#else
simde__m256d_private
r_,
a_ = simde__m256d_to_private(a),
b_ = simde__m256d_to_private(b);

#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
simde__m256d_private aev_, aod_, bev_, bod_;
aev_.i256 = __lasx_xvpickev_d(a_.i256, a_.i256);
aod_.i256 = __lasx_xvpickod_d(a_.i256, a_.i256);
bev_.i256 = __lasx_xvpickev_d(b_.i256, b_.i256);
bod_.i256 = __lasx_xvpickod_d(b_.i256, b_.i256);
aev_.d256 = __lasx_xvfsub_d(aev_.d256, bev_.d256);
aod_.d256 = __lasx_xvfadd_d(aod_.d256, bod_.d256);
r_.i256 = __lasx_xvilvl_d(aod_.i256, aev_.i256);
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]);
r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]);
#else
Expand Down
18 changes: 15 additions & 3 deletions simde/x86/f16c.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ simde_mm_cvtps_ph(simde__m128 a, const int imm8) {

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0)));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vfcvt_h_s((v4f32)__lsx_vreplgr2vr_w(0), a_.lsx_f32);
#elif defined(SIMDE_FLOAT16_VECTOR)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
Expand Down Expand Up @@ -81,7 +83,9 @@ simde_mm_cvtph_ps(simde__m128i a) {
simde__m128i_private a_ = simde__m128i_to_private(a);
simde__m128_private r_;

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_f32 = __lsx_vfcvtl_s_h(a_.lsx_i64);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16));
#elif defined(SIMDE_FLOAT16_VECTOR)
SIMDE_VECTORIZE
Expand Down Expand Up @@ -110,7 +114,11 @@ simde_mm256_cvtps_ph(simde__m256 a, const int imm8) {

HEDLEY_STATIC_CAST(void, imm8);

#if defined(SIMDE_FLOAT16_VECTOR)
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
a_.i256 = __lasx_xvfcvt_h_s(a_.f256, a_.f256);
a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8);
r_.lsx_i64 = simde_mm256_extractf128_si256(a_.i256, 0);
#elif defined(SIMDE_FLOAT16_VECTOR)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r_.f16[i] = simde_float16_from_float32(a_.f32[i]);
Expand Down Expand Up @@ -146,7 +154,11 @@ simde_mm256_cvtph_ps(simde__m128i a) {
simde__m128i_private a_ = simde__m128i_to_private(a);
simde__m256_private r_;

#if defined(SIMDE_FLOAT16_VECTOR)
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
r_.i256 = simde_mm256_castsi128_si256(a_.lsx_i64);
r_.i256 = __lasx_xvpermi_d(r_.i256, 0xd8);
r_.f256 = __lasx_xvfcvtl_s_h(r_.i256);
#elif defined(SIMDE_FLOAT16_VECTOR)
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.f32[i] = simde_float16_to_float32(a_.f16[i]);
Expand Down
56 changes: 56 additions & 0 deletions simde/x86/fma.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ simde__m128d
simde_mm_fmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfmadd_d(a, b, c);
#else
simde__m128d_private
a_ = simde__m128d_to_private(a),
Expand Down Expand Up @@ -78,6 +80,8 @@ simde__m256d
simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfmadd_d(a, b, c);
#else
return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c);
#endif
Expand All @@ -92,6 +96,8 @@ simde__m128
simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfmadd_s(a, b, c);
#else
simde__m128_private
a_ = simde__m128_to_private(a),
Expand Down Expand Up @@ -130,6 +136,8 @@ simde__m256
simde_mm256_fmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfmadd_s(a, b, c);
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
simde__m256_private
a_ = simde__m256_to_private(a),
Expand All @@ -156,6 +164,8 @@ simde__m128d
simde_mm_fmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT)
return _mm_fmadd_sd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return (simde__m128d)__lsx_vextrins_d(a, __lsx_vfmadd_d(a, b, c), 0x00);
#else
return simde_mm_add_sd(simde_mm_mul_sd(a, b), c);
#endif
Expand All @@ -170,6 +180,8 @@ simde__m128
simde_mm_fmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT)
return _mm_fmadd_ss(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return (simde__m128)__lsx_vextrins_w(a, __lsx_vfmadd_s(a, b, c), 0x00);
#else
return simde_mm_add_ss(simde_mm_mul_ss(a, b), c);
#endif
Expand Down Expand Up @@ -240,6 +252,8 @@ simde__m128d
simde_mm_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmsub_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfmsub_d(a, b, c);
#else
return simde_mm_sub_pd(simde_mm_mul_pd(a, b), c);
#endif
Expand All @@ -254,6 +268,8 @@ simde__m256d
simde_mm256_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmsub_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfmsub_d(a, b, c);
#else
return simde_mm256_sub_pd(simde_mm256_mul_pd(a, b), c);
#endif
Expand All @@ -268,6 +284,8 @@ simde__m128
simde_mm_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmsub_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfmsub_s(a, b, c);
#else
return simde_mm_sub_ps(simde_mm_mul_ps(a, b), c);
#endif
Expand All @@ -282,6 +300,8 @@ simde__m256
simde_mm256_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmsub_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfmsub_s(a, b, c);
#else
return simde_mm256_sub_ps(simde_mm256_mul_ps(a, b), c);
#endif
Expand Down Expand Up @@ -324,6 +344,11 @@ simde__m128d
simde_mm_fmsubadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmsubadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
a = __lsx_vfmul_d(a, b);
b = __lsx_vfsub_d(a, c);
c = __lsx_vfadd_d(a, c);
return (simde__m128d)__lsx_vextrins_d(c, b, 0x11);
#else
simde__m128d_private
r_,
Expand All @@ -350,6 +375,11 @@ simde__m256d
simde_mm256_fmsubadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmsubadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
a = __lasx_xvfmul_d(a, b);
b = __lasx_xvfsub_d(a, c);
c = __lasx_xvfadd_d(a, c);
return (simde__m256d)__lasx_xvextrins_d(c, b, 0x11);
#else
simde__m256d_private
r_,
Expand All @@ -376,6 +406,11 @@ simde__m128
simde_mm_fmsubadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fmsubadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
a = __lsx_vfmul_s(a, b);
b = __lsx_vfsub_s(a, c);
c = __lsx_vfadd_s(a, c);
return (simde__m128)__lsx_vextrins_w(__lsx_vextrins_w(c, b, 0x11), b, 0x33);
#else
simde__m128_private
r_,
Expand All @@ -402,6 +437,11 @@ simde__m256
simde_mm256_fmsubadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fmsubadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
a = __lasx_xvfmul_s(a, b);
b = __lasx_xvfsub_s(a, c);
c = __lasx_xvfadd_s(a, c);
return (simde__m256)__lasx_xvextrins_w(__lasx_xvextrins_w(c, b, 0x11), b, 0x33);
#else
simde__m256_private
r_,
Expand All @@ -428,6 +468,8 @@ simde__m128d
simde_mm_fnmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fnmadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfsub_d(c, __lsx_vfmul_d(a, b));
#else
simde__m128d_private
r_,
Expand Down Expand Up @@ -457,6 +499,8 @@ simde__m256d
simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fnmadd_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfsub_d(c, __lasx_xvfmul_d(a, b));
#else
simde__m256d_private
r_,
Expand Down Expand Up @@ -487,6 +531,8 @@ simde__m128
simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fnmadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfsub_s(c, __lsx_vfmul_s(a, b));
#else
simde__m128_private
r_,
Expand Down Expand Up @@ -518,6 +564,8 @@ simde__m256
simde_mm256_fnmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fnmadd_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfsub_s(c, __lasx_xvfmul_s(a, b));
#else
simde__m256_private
r_,
Expand Down Expand Up @@ -589,6 +637,8 @@ simde__m128d
simde_mm_fnmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fnmsub_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfsub_d((__m128d)__lsx_vreplgr2vr_d(0), __lsx_vfmadd_d(a, b, c));
#else
simde__m128d_private
r_,
Expand All @@ -614,6 +664,8 @@ simde__m256d
simde_mm256_fnmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fnmsub_pd(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfsub_d((__m256d)__lasx_xvreplgr2vr_d(0), __lasx_xvfmadd_d(a, b, c));
#else
simde__m256d_private
r_,
Expand All @@ -639,6 +691,8 @@ simde__m128
simde_mm_fnmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm_fnmsub_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
return __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(0), __lsx_vfmadd_s(a, b, c));
#else
simde__m128_private
r_,
Expand All @@ -664,6 +718,8 @@ simde__m256
simde_mm256_fnmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) {
#if defined(SIMDE_X86_FMA_NATIVE)
return _mm256_fnmsub_ps(a, b, c);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvfsub_s((__m256)__lasx_xvreplgr2vr_w(0), __lasx_xvfmadd_s(a, b, c));
#else
simde__m256_private
r_,
Expand Down
Loading