From 5a30c2fab859acc11035daef2ed8dcf166736f55 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Tue, 26 Nov 2024 11:37:38 +0800 Subject: [PATCH] loongarch: add lsx support for sse2.h --- simde/simde-f16.h | 2 +- simde/x86/sse2.h | 667 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 621 insertions(+), 48 deletions(-) diff --git a/simde/simde-f16.h b/simde/simde-f16.h index 6171f84ee..2f7c4c513 100644 --- a/simde/simde-f16.h +++ b/simde/simde-f16.h @@ -90,7 +90,7 @@ SIMDE_BEGIN_DECLS_ #elif defined(__FLT16_MIN__) && \ (defined(__clang__) && \ (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) \ - && !defined(SIMDE_ARCH_RISCV64)) + && !defined(SIMDE_ARCH_RISCV64) && !defined(SIMDE_ARCH_LOONGARCH)) #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16_NO_ABI #else #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_PORTABLE diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 610d54fee..fcbb5a40d 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -139,6 +139,17 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128i_private; @@ -223,6 +234,17 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128d_private; @@ -248,6 +270,9 @@ typedef union { #else typedef simde__m128d_private simde__m128d; #endif +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + typedef v2i64 simde__m128i; + typedef v2f64 simde__m128d; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -328,6 +353,17 @@ simde__m128d_to_private(simde__m128d v) { SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) #endif +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -375,6 +411,17 @@ simde__m128d_to_private(simde__m128d v) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128); SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128); 
+#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ SIMDE_FUNCTION_ATTRIBUTES @@ -390,6 +437,9 @@ simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.neon_f64 = vld1q_f64(data); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.f64[0] = e0; r_.f64[1] = e1; @@ -416,6 +466,8 @@ simde_mm_set1_pd (simde_float64 a) { r_.neon_f64 = vdupq_n_f64(a); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_d(&a, 0); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -451,6 +503,9 @@ simde_x_mm_abs_pd(simde__m128d a) { r_.altivec_f64 = vec_abs(a_.altivec_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_abs(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); + r_.lsx_i64 = __lsx_vand_v(__lsx_vldrepl_d(&u64_, 0), a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -481,6 +536,8 @@ simde_x_mm_not_pd(simde__m128d a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f; #else @@ -518,6 +575,8 @@ simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) { r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_u64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -546,6 +605,8 @@ simde_mm_add_epi8 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 + b_.i8; #else @@ -579,6 +640,8 @@ simde_mm_add_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_h(a_.lsx_i64, b_.lsx_i64); #elif
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 + b_.i16; #else @@ -612,6 +675,8 @@ simde_mm_add_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 + b_.i32; #else @@ -643,6 +708,8 @@ simde_mm_add_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vadd_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -678,6 +745,8 @@ simde_mm_add_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfadd_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f64 = a_.f64 + b_.f64; #else @@ -715,6 +784,8 @@ simde_mm_move_sd (simde__m128d a, simde__m128d b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(b_.lsx_i64, a_.lsx_i64, 0b00010001); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1); #else @@ -749,6 +820,8 @@ simde_x_mm_broadcastlow_pd(simde__m128d a) { r_.altivec_f64 = vec_splat(a_.altivec_f64, 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_splat(a_.f64[0]); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_d(a_.lsx_i64, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); #else @@ -776,10 +849,12 @@ simde_mm_add_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] + b_.f64[0]; - r_.f64[1] = a_.f64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfadd_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.f64[0] = a_.f64[0] + b_.f64[0]; + r_.f64[1] = a_.f64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -828,6 +903,8 @@ simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -859,6 +936,8 @@ simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -890,6 +969,8 @@ simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) { 
r_.wasm_v128 = wasm_u8x16_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -921,6 +1002,8 @@ simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_add_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsadd_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -952,6 +1035,8 @@ simde_mm_and_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f & b_.i32f; #else @@ -985,6 +1070,8 @@ simde_mm_and_si128 (simde__m128i a, simde__m128i b) { r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f & b_.i32f; #else @@ -1020,6 +1107,8 @@ simde_mm_andnot_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f & b_.i32f; #else @@ -1053,6 +1142,8 @@ simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f & b_.i32f; #else @@ -1086,6 +1177,8 @@ simde_mm_xor_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -1117,6 +1210,8 @@ simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vavgr_bu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) uint16_t wa SIMDE_VECTOR(32); uint16_t wb SIMDE_VECTOR(32); @@ -1156,6 +1251,8 @@ simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vavgr_hu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) uint32_t wa SIMDE_VECTOR(32); uint32_t wb SIMDE_VECTOR(32); @@ -1192,6 +1289,8 @@ simde_mm_setzero_si128 (void) { r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_w(0); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 }; #else @@ -1243,6 +1342,9 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_bslli_si128(a, imm8) \ + (((imm8)<=0) ? (a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsll_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bslli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) @@ -1338,6 +1440,9 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_bsrli_si128(a, imm8) \ + (((imm8)<=0) ? (a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsrl_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bsrli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? 
imm8 : (imm8 & 15)))) @@ -1434,6 +1539,8 @@ simde_mm_comieq_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] == b_.f64[0]; #endif @@ -1456,6 +1563,8 @@ simde_mm_comige_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] >= b_.f64[0]; #endif @@ -1478,6 +1587,8 @@ simde_mm_comigt_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] > b_.f64[0]; #endif @@ -1500,6 +1611,8 @@ simde_mm_comile_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); #else return a_.f64[0] <= b_.f64[0]; #endif @@ -1522,6 +1635,8 @@ simde_mm_comilt_sd (simde__m128d a, simde__m128d b) { return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); #else return a_.f64[0] < b_.f64[0]; #endif @@ -1544,6 +1659,8 @@ simde_mm_comineq_sd (simde__m128d a, simde__m128d b) { return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return !__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(b_.lsx_f64, a_.lsx_f64), 0); #else return a_.f64[0] != b_.f64[0]; #endif @@ -1577,6 +1694,9 @@ simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) { #else r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64); #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v2f64 sign_pos = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); #elif defined(simde_math_copysign) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -1603,6 +1723,8 @@ simde_mm_castpd_ps (simde__m128d a) { return _mm_castpd_ps(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f32_f64(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)a; #else simde__m128 r; simde_memcpy(&r, &a, sizeof(a)); @@ -1620,6 +1742,8 @@ simde_mm_castpd_si128 (simde__m128d a) { return _mm_castpd_si128(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_s64_f64(a); 
+ #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)a; #else simde__m128i r; simde_memcpy(&r, &a, sizeof(a)); @@ -1637,6 +1761,8 @@ simde_mm_castps_pd (simde__m128 a) { return _mm_castps_pd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f64_f32(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)a; #else simde__m128d r; simde_memcpy(&r, &a, sizeof(a)); @@ -1654,6 +1780,8 @@ simde_mm_castps_si128 (simde__m128 a) { return _mm_castps_si128(a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)a; #else simde__m128i r; simde_memcpy(&r, &a, sizeof(a)); @@ -1671,6 +1799,8 @@ simde_mm_castsi128_pd (simde__m128i a) { return _mm_castsi128_pd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f64_s64(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)a; #else simde__m128d r; simde_memcpy(&r, &a, sizeof(a)); @@ -1690,6 +1820,8 @@ simde_mm_castsi128_ps (simde__m128i a) { return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return HEDLEY_REINTERPRET_CAST(__m128, a); #else simde__m128 r; simde_memcpy(&r, &a, sizeof(a)); @@ -1717,6 +1849,8 @@ simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); #else @@ -1750,6 +1884,8 @@ simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = (a_.i16 == b_.i16); #else @@ -1783,6 +1919,8 @@ simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vseq_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); #else @@ -1818,6 +1956,8 @@ simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_MIPS_MSA_NATIVE) r_.msa_i32 = __msa_addv_w(a_.msa_i32, b_.msa_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), 
(a_.f64 == b_.f64)); #else @@ -1849,9 +1989,12 @@ simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0; - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0; + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -1874,6 +2017,8 @@ simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); #else @@ -1904,11 +2049,12 @@ simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -1933,6 +2079,8 @@ simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); #else @@ -1966,6 +2114,8 @@ simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); #else @@ -1999,6 +2149,8 @@ simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); #else @@ -2032,6 +2184,8 @@ simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); #else @@ -2063,9 +2217,12 @@ simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2092,6 +2249,8 @@ simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2120,10 +2279,12 @@ simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2148,6 +2309,8 @@ simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8)); #else @@ -2181,6 +2344,8 @@ simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16)); #else @@ -2214,6 +2379,8 @@ simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vslt_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32)); #else @@ -2249,6 +2416,8 @@ simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, 
b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2277,10 +2446,12 @@ simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2307,6 +2478,8 @@ simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2336,9 +2509,12 @@ simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); + #else + r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -2471,6 +2647,9 @@ simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_and(wasm_f64x2_eq(a_.wasm_v128, a_.wasm_v128), wasm_f64x2_eq(b_.wasm_v128, b_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(__lsx_vfcmp_ceq_d(a_.lsx_f64, a_.lsx_f64), + __lsx_vfcmp_ceq_d(b_.lsx_f64, b_.lsx_f64)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2498,6 +2677,9 @@ simde_mm_cvtsd_f64 (simde__m128d a) { return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return HEDLEY_STATIC_CAST(simde_float64, wasm_f64x2_extract_lane(a_.wasm_v128, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, &a_.f64, 0, 0); + return a_.f64[0]; #else return a_.f64[0]; #endif @@ -2522,7 +2704,10 @@ simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_isnan) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vand_v(__lsx_vfcmp_ceq_d(a_.lsx_f64, + a_.lsx_f64), __lsx_vfcmp_ceq_d(b_.lsx_f64, b_.lsx_f64)), 0); + #elif defined(simde_math_isnan) r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else @@ -2554,6 +2739,9 @@ simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_or(wasm_f64x2_ne(a_.wasm_v128, a_.wasm_v128), wasm_f64x2_ne(b_.wasm_v128, b_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(__lsx_vfcmp_cune_d(a_.lsx_f64, a_.lsx_f64), + __lsx_vfcmp_cune_d(b_.lsx_f64, b_.lsx_f64)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2585,7 +2773,9 @@ simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_isnan) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_d(a_.lsx_f64, a_.lsx_f64), __lsx_vfcmp_cune_d(b_.lsx_f64, b_.lsx_f64)), 0); + #elif defined(simde_math_isnan) r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else @@ -2610,6 +2800,8 @@ simde_mm_cvtepi32_pd (simde__m128i a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_convert_low_i32x4(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vffintl_d_w(a_.lsx_i32); #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); #else @@ -2646,6 +2838,8 @@ simde_mm_cvtepi32_ps (simde__m128i a) { #endif r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0); HEDLEY_DIAGNOSTIC_POP + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vffint_s_w(a_.lsx_i64); #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); #else @@ -2697,9 +2891,13 @@ simde_mm_cvtpd_epi32 (simde__m128d a) { #else simde__m128i_private r_; - r_.m64[0] = simde_mm_cvtpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vftintrne_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); + #else + r_.m64[0] = simde_mm_cvtpd_pi32(a); + r_.m64[1] = simde_mm_setzero_si64(); + #endif return simde__m128i_from_private(r_); #endif } @@ -2722,6 +2920,9 @@ simde_mm_cvtpd_ps (simde__m128d a) { r_.altivec_f32 = vec_float2(a_.altivec_f64, vec_splats(0.0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_demote_f64x2_zero(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_f32 = __lsx_vfcvt_s_d(zero_f64, a_.lsx_f64); #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && HEDLEY_HAS_BUILTIN(__builtin_convertvector) float __attribute__((__vector_size__(8))) z = { 0.0f, 0.0f }; r_.f32 = @@ -2790,6 +2991,9 @@ simde_mm_cvtps_epi32 (simde__m128 a) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) a_ = simde__m128_to_private(a); r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) + a_ = simde__m128_to_private(a); + r_.lsx_i32 = __lsx_vftintrne_w_s(a_.lsx_f32); #else a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE @@ -2826,6 +3030,8 @@ simde_mm_cvtps_pd (simde__m128 a) { SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = 
vcvt_f64_f32(vget_low_f32(a_.neon_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfcvtl_d_s(a_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2845,6 +3051,9 @@ int32_t simde_mm_cvtsd_si32 (simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_cvtsd_si32(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_w(__lsx_vftintrne_w_d(a_.lsx_f64, a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); @@ -2872,7 +3081,11 @@ simde_mm_cvtsd_si64 (simde__m128d a) { #endif #else simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(__lsx_vftintrne_l_d(a_.lsx_f64), 0); + #else + return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); + #endif #endif } #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) @@ -2894,6 +3107,8 @@ simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcvt_s_d(b_.lsx_f64, b_.lsx_f64), 0); #else r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]); @@ -2924,6 +3139,8 @@ simde_x_mm_cvtsi128_si16 (simde__m128i a) { (void) a_; #endif return vec_extract(a_.altivec_i16, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_h(a_.lsx_i64, 0); #else return a_.i16[0]; #endif @@ -2947,6 +3164,8 @@ simde_mm_cvtsi128_si32 (simde__m128i a) { (void) a_; #endif return vec_extract(a_.altivec_i32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_w(a_.lsx_i64, 0); #else return a_.i32[0]; #endif @@ -2971,6 +3190,8 @@ simde_mm_cvtsi128_si64 (simde__m128i a) { return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vgetq_lane_s64(a_.neon_i64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return __lsx_vpickve2gr_d(a_.lsx_i64, 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0)); #endif @@ -2994,6 +3215,9 @@ simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float64 b_float64 = (simde_float64)b; + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(&(b_float64), 0), 0); #else r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); r_.i64[1] = a_.i64[1]; @@ -3015,6 +3239,8 @@ simde_x_mm_cvtsi16_si128 (int16_t a) { r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_h(__lsx_vreplgr2vr_h(0), a, 0); #else r_.i16[0] = a; r_.i16[1] = 0; @@ -3041,6 +3267,8 @@ simde_mm_cvtsi32_si128 (int32_t a) { r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), a, 0); #else r_.i32[0] = a; 
r_.i32[1] = 0; @@ -3071,6 +3299,9 @@ simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde_float64 b_float64 = (simde_float64)b; + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vldrepl_d(&(b_float64), 0), 0); #else r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); r_.f64[1] = a_.f64[1]; @@ -3101,6 +3332,8 @@ simde_mm_cvtsi64_si128 (int64_t a) { r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_make(a, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), a, 0); #else r_.i64[0] = a; r_.i64[1] = 0; @@ -3128,8 +3361,11 @@ simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) { a_ = simde__m128d_to_private(a); simde__m128_private b_ = simde__m128_to_private(b); - a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcvtl_d_s(b_.lsx_f32), 0); + #else + a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); + #endif return simde__m128d_from_private(a_); #endif } @@ -3175,9 +3411,13 @@ simde_mm_cvttpd_epi32 (simde__m128d a) { #else simde__m128i_private r_; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + const v2f64 zero_f64 = {-0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vftintrz_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); + #else r_.m64[0] = simde_mm_cvttpd_pi32(a); r_.m64[1] = simde_mm_setzero_si64(); - + #endif return simde__m128i_from_private(r_); #endif } @@ -3232,6 +3472,25 @@ simde_mm_cvttps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i32 = __lsx_vftintrz_w_s(a_.lsx_f32); + #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) + #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) + simde_float32 f1 = 2147483648.0f; + __m128i valid_input = + __lsx_vand_v( + __lsx_vfcmp_clt_s(a_.lsx_f32, (__m128)__lsx_vldrepl_w(&f1, 0)), + __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32) + ); + #elif !defined(SIMDE_FAST_CONVERSION_RANGE) + simde_float32 f1 = 2147483648.0f; + __m128i valid_input = __lsx_vfcmp_clt_s(a_.lsx_f32, (__m128)__lsx_vldrepl_w(&f1, 0)); + #elif !defined(SIMDE_FAST_NANS) + __m128i valid_input = __lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32); + #endif + + r_.lsx_i32 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), r_.lsx_i32, valid_input); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); @@ -3275,6 +3534,9 @@ int32_t simde_mm_cvttsd_si32 (simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_cvttsd_si32(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_w(__lsx_vftintrz_w_d(a_.lsx_f64, a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); simde_float64 v = a_.f64[0]; @@ -3299,6 +3561,9 @@ simde_mm_cvttsd_si64 (simde__m128d a) { #else return _mm_cvttsd_si64x(a); #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128d_private a_ = simde__m128d_to_private(a); + return __lsx_vpickve2gr_d(__lsx_vftintrz_l_d(a_.lsx_f64), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); return
SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); @@ -3327,6 +3592,8 @@ simde_mm_div_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfdiv_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -3359,6 +3626,9 @@ simde_mm_div_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128d temp = __lsx_vfdiv_d(a_.lsx_f64, b_.lsx_f64); + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)temp, 0); #else r_.f64[0] = a_.f64[0] / b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -3396,6 +3666,8 @@ simde_mm_extract_epi16 (simde__m128i a, const int imm8) #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff))) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, wasm_u16x8_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, __lsx_vpickve2gr_hu(simde__m128i_to_private(a).lsx_i64, imm8)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) @@ -3415,6 +3687,8 @@ simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8) #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_insert_epi16(a, i, imm8) wasm_i16x8_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7, (i)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_lsx_i16((v8i16)__lsx_vinsgr2vr_h(simde__m128i_to_private(a).lsx_i64, i, imm8)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) @@ -3434,6 +3708,8 @@ simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_)); #endif @@ -3454,6 +3730,8 @@ simde_mm_load1_pd (simde_float64 const* mem_addr) { return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128d_from_wasm_v128(wasm_v128_load64_splat(mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)__lsx_vldrepl_d(mem_addr, 0); #else return simde_mm_set1_pd(*mem_addr); #endif @@ -3476,6 +3754,8 @@ simde_mm_load_sd (simde_float64 const* mem_addr) { r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load64_zero(HEDLEY_REINTERPRET_CAST(const void*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(__lsx_vreplgr2vr_d(0), __lsx_vldrepl_d(mem_addr, 0), 0); #else r_.f64[0] = *mem_addr;
r_.u64[1] = UINT64_C(0); @@ -3495,6 +3775,8 @@ simde_mm_load_si128 (simde__m128i const* mem_addr) { return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128i)__lsx_vld(mem_addr, 0); #else simde__m128i_private r_; @@ -3525,6 +3807,8 @@ simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(__lsx_vldrepl_d(mem_addr, 0), a_.lsx_i64); #else simde_float64 t; @@ -3550,6 +3834,8 @@ simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vldrepl_d(mem_addr, 0), 0, 1); #else int64_t value; simde_memcpy(&value, mem_addr, sizeof(value)); @@ -3579,6 +3865,8 @@ simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, __lsx_vldrepl_d(mem_addr, 0)); #else r_.f64[0] = *mem_addr; r_.u64[1] = a_.u64[1]; @@ -3609,6 +3897,9 @@ simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) v128_t tmp = wasm_v128_load(mem_addr); r_.wasm_v128 = wasm_i64x2_shuffle(tmp, tmp, 1, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vld(mem_addr, 0); + r_.lsx_i64 = __lsx_vshuf4i_d(temp, temp, 0b0001); #else r_.f64[0] = mem_addr[1]; r_.f64[1] = mem_addr[0]; @@ -3628,6 +3919,8 @@ simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { return _mm_loadu_pd(mem_addr); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld1q_f64(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128d)__lsx_vld(mem_addr, 0); #else simde__m128d_private r_; @@ -3655,6 +3948,8 @@ simde_mm_loadu_epi8(void const * mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3684,6 +3979,8 @@ simde_mm_loadu_epi16(void const * mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3712,6 +4009,8 @@ simde_mm_loadu_epi32(void const * mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr), 0); #else simde_memcpy(&r_, mem_addr, 
sizeof(r_)); #endif @@ -3741,6 +4040,8 @@ simde_mm_loadu_epi64(void const * mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3773,6 +4074,8 @@ simde_mm_loadu_si128 (void const* mem_addr) { HEDLEY_DIAGNOSTIC_POP #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -3819,6 +4122,9 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.i32 = __builtin_shufflevector(p32, p32, 0, 2, 4, 6) + __builtin_shufflevector(p32, p32, 1, 3, 5, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vmaddwod_w_h(temp_ev, a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { @@ -3843,11 +4149,17 @@ simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HED a_ = simde__m128i_to_private(a), mask_ = simde__m128i_to_private(mask); - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - if (mask_.u8[i] & 0x80) { - mem_addr[i] = a_.i8[i]; + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vld(mem_addr, 0); + __m128i temp1 = __lsx_vbitsel_v(temp, a_.lsx_i64, __lsx_vslti_b(mask_.lsx_i64, 0)); + __lsx_vst(temp1, mem_addr, 0); + #else + for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { + if (mask_.u8[i] & 0x80) { + mem_addr[i] = a_.i8[i]; + } } - } + #endif #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -3896,6 +4208,8 @@ simde_mm_movemask_epi8 (simde__m128i a) { r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = HEDLEY_STATIC_CAST(int32_t, wasm_i8x16_bitmask(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = __lsx_vpickve2gr_w(__lsx_vmskltz_b(a_.lsx_i64), 0); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { @@ -3937,6 +4251,8 @@ simde_mm_movemask_pd (simde__m128d a) { r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r = HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_bitmask(a_.wasm_v128)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = __lsx_vpickve2gr_w(__lsx_vmskltz_d(a_.lsx_i64), 0); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { @@ -3962,6 +4278,8 @@ simde_mm_movepi64_pi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_i64 = vget_low_s64(a_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.i64[0] = __lsx_vpickve2gr_d(a_.lsx_i64, 0); #else r_.i64[0] = a_.i64[0]; #endif @@ -3984,6 +4302,8 @@ simde_mm_movpi64_epi64 (simde__m64 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), a_.i64[0], 0); #else r_.i64[0] = a_.i64[0]; r_.i64[1] = 0; @@ -4013,6 +4333,8 @@ simde_mm_min_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128); 
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4044,6 +4366,8 @@ simde_mm_min_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmin_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -4075,6 +4399,8 @@ simde_mm_min_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4107,6 +4433,8 @@ simde_mm_min_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4136,6 +4464,8 @@ simde_mm_max_epi16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4167,6 +4497,8 @@ simde_mm_max_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmax_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -4198,6 +4530,8 @@ simde_mm_max_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4230,6 +4564,8 @@ simde_mm_max_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4256,6 +4592,8 @@ simde_mm_move_epi64 (simde__m128i a) { r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, wasm_i64x2_const(0, 0), 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(__lsx_vreplgr2vr_d(0), a_.lsx_i64); #else r_.i64[0] = a_.i64[0]; r_.i64[1] = 0; @@ -4287,6 +4625,8 @@ simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u64x2_extmul_low_u32x4( wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 2, 0, 2), wasm_i32x4_shuffle(b_.wasm_v128, b_.wasm_v128, 0, 2, 0, 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmulwev_d_wu(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(a_.u32) z = { 0, }; a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6); @@ -4317,6 +4657,8 @@ simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 * b_.i64; #else @@ -4337,7 +4679,9 @@ simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) { a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmod_d(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) r_.i64 = a_.i64 % b_.i64; #else SIMDE_VECTORIZE @@ -4366,6 +4710,8 @@ simde_mm_mul_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4398,6 +4744,8 @@ simde_mm_mul_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64); r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64), 0); #else r_.f64[0] = a_.f64[0] * b_.f64[0]; r_.f64[1] = a_.f64[1]; @@ -4463,6 +4811,8 @@ simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { const v128_t lo = wasm_i32x4_extmul_low_i16x8(a_.wasm_v128, b_.wasm_v128); const v128_t hi = wasm_i32x4_extmul_high_i16x8(a_.wasm_v128, b_.wasm_v128); r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmuh_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4506,6 +4856,8 @@ simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { const v128_t lo = wasm_u32x4_extmul_low_u16x8(a_.wasm_v128, b_.wasm_v128); const v128_t hi = wasm_u32x4_extmul_high_u16x8(a_.wasm_v128, b_.wasm_v128); r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmuh_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / 
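/* Editor's note (illustrative sketch, not part of the patch): _mm_mul_epu32
 * multiplies only the even 32-bit lanes (indices 0 and 2) into full 64-bit
 * products, which is exactly what a single __lsx_vmulwev_d_wu does above. The
 * hypothetical ref_mul_epu32() is a scalar reference model of that semantic. */
#include <stdint.h>
static void ref_mul_epu32(const uint32_t a[4], const uint32_t b[4], uint64_t r[2]) {
  r[0] = (uint64_t) a[0] * b[0];  /* product of lane 0 of each input */
  r[1] = (uint64_t) a[2] * b[2];  /* product of lane 2 of each input */
}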
sizeof(r_.u16[0])) ; i++) { @@ -4539,6 +4891,8 @@ simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vmul_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4570,6 +4924,8 @@ simde_mm_or_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -4601,6 +4957,8 @@ simde_mm_or_si128 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f | b_.i32f; #else @@ -4636,6 +4994,8 @@ simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i8 = vec_packs(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_b_h(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int16_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const int16_t SIMDE_VECTOR(32) min = { INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN }; @@ -4685,6 +5045,8 @@ simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) { r_.sse_m128i = _mm_packs_epi32(a_.sse_m128i, b_.sse_m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_h_w(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int32_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); const int32_t SIMDE_VECTOR(32) min = { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }; @@ -4740,6 +5102,8 @@ simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssrarni_bu_h(b_.lsx_i64, a_.lsx_i64, 0); #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) int16_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -4783,6 +5147,8 @@ simde_mm_pause (void) { __asm__ __volatile__ ("or 27,27,27" ::: "memory"); #elif defined(SIMDE_ARCH_WASM) __asm__ __volatile__ ("nop"); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __asm__ __volatile ("dbar 0"); #elif 
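/* Editor's note (illustrative sketch, not part of the patch): with a shift amount
 * of 0, __lsx_vssrarni_b_h(b, a, 0) used above reduces to "saturate each int16_t
 * to int8_t and concatenate a (low half) with b (high half)", which is the
 * _mm_packs_epi16 semantic. The hypothetical helpers below model the saturation
 * step in scalar C. */
#include <stdint.h>
static int8_t ref_sat_i16_to_i8(int16_t v) {
  if (v < INT8_MIN) return INT8_MIN;
  if (v > INT8_MAX) return INT8_MAX;
  return (int8_t) v;
}
static void ref_packs_epi16(const int16_t a[8], const int16_t b[8], int8_t r[16]) {
  for (int i = 0; i < 8; i++) {
    r[i]     = ref_sat_i16_to_i8(a[i]);  /* low 8 bytes come from a */
    r[i + 8] = ref_sat_i16_to_i8(b[i]);  /* high 8 bytes come from b */
  }
}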
defined(HEDLEY_GCC_VERSION) #if defined(SIMDE_ARCH_RISCV) __builtin_riscv_pause(); @@ -4819,6 +5185,11 @@ simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) { tmp = wasm_i16x8_add(tmp, wasm_i32x4_shl(tmp, 16)); tmp = wasm_i16x8_add(tmp, wasm_i64x2_shl(tmp, 32)); r_.wasm_v128 = wasm_u64x2_shr(tmp, 48); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vabsd_bu(a_.lsx_i64, b_.lsx_i64); + temp = __lsx_vhaddw_hu_bu(temp, temp); + temp = __lsx_vhaddw_wu_hu(temp, temp); + r_.lsx_i64 = __lsx_vhaddw_du_wu(temp, temp); #else for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { uint16_t tmp = 0; @@ -4863,6 +5234,13 @@ simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, e8, e9, e10, e11, e12, e13, e14, e15}; r_.neon_i8 = vld1q_s8(data); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v16i8) int8_t data[16] = { + e0, e1, e2, e3, + e4, e5, e6, e7, + e8, e9, e10, e11, + e12, e13, e14, e15}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i8[ 0] = e0; r_.i8[ 1] = e1; @@ -4903,6 +5281,9 @@ simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, r_.neon_i16 = vld1q_s16(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v8i16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i16[0] = e0; r_.i16[1] = e1; @@ -4929,6 +5310,8 @@ simde_mm_loadu_si16 (void const* mem_addr) { HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si16(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + int16_t val; + simde_memcpy(&val, mem_addr, sizeof(val)); + return __lsx_vinsgr2vr_h(__lsx_vreplgr2vr_h(0), val, 0); #else int16_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -4952,6 +5335,9 @@ simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { r_.neon_i32 = vld1q_s32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v4i32) int32_t data[4] = {e0, e1, e2, e3}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i32[0] = e0; r_.i32[1] = e1; @@ -4980,6 +5366,8 @@ simde_mm_loadu_si32 (void const* mem_addr) { simde__m128i_private r_; r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); return simde__m128i_from_private(r_); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + int32_t val; + simde_memcpy(&val, mem_addr, sizeof(val)); + return __lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), val, 0); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5000,6 +5388,9 @@ simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_TO_16 simde__m64 data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.m64[0] = e0; r_.m64[1] = e1; @@ -5025,6 +5416,9 @@ simde_mm_set_epi64x (int64_t e1, int64_t e0) { r_.neon_i64 = vld1q_s64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_make(e0, e1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v2i64) int64_t data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.i64[0] = e0; r_.i64[1] = e1; @@ -5045,6 +5439,8 @@ simde_mm_loadu_si64 (void const* mem_addr) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) return _mm_loadu_si64(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + int64_t val; + simde_memcpy(&val, mem_addr, sizeof(val)); + return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), val, 0); #else int64_t val; simde_memcpy(&val, mem_addr,
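/* Editor's note (illustrative sketch, not part of the patch): the LSX lowering of
 * _mm_sad_epu8 above takes per-byte absolute differences (__lsx_vabsd_bu) and
 * folds them with three widening horizontal adds (byte->half->word->double),
 * producing one small sum per 64-bit half. The hypothetical ref_sad_epu8() below
 * is a scalar reference model of that semantic. */
#include <stdint.h>
static void ref_sad_epu8(const uint8_t a[16], const uint8_t b[16], uint64_t r[2]) {
  for (int half = 0; half < 2; half++) {
    uint16_t sum = 0;
    for (int i = 0; i < 8; i++) {
      int d = (int) a[half * 8 + i] - (int) b[half * 8 + i];
      sum = (uint16_t) (sum + (d < 0 ? -d : d));  /* accumulate |a - b| */
    }
    r[half] = sum;  /* zero-extended into each 64-bit lane */
  }
}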
sizeof(val)); @@ -5079,6 +5475,13 @@ simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, r_.neon_u8 = vld1q_u8(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v16u8) uint8_t data[16] = { + e0, e1, e2, e3, + e4, e5, e6, e7, + e8, e9, e10, e11, + e12, e13, e14, e15}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3; r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7; @@ -5106,6 +5509,9 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, r_.neon_u16 = vld1q_u16(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v8u16) uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3; r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7; @@ -5129,6 +5535,9 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { r_.neon_u32 = vld1q_u32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v4u32) uint32_t data[4] = {e0, e1, e2, e3}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u32[0] = e0; r_.u32[1] = e1; @@ -5153,6 +5562,9 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { r_.neon_u64 = vld1q_u64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u64x2_make(e0, e1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_ALIGN_LIKE_16(v2u64) uint64_t data[2] = {e0, e1}; + r_.lsx_i64 = __lsx_vld(data, 0); #else r_.u64[0] = e0; r_.u64[1] = e1; @@ -5171,6 +5583,8 @@ simde_mm_set_sd (simde_float64 a) { return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (__m128d)__lsx_vinsgr2vr_d(__lsx_vldrepl_d(&a, 0), 0, 1); #else return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); #endif @@ -5193,6 +5607,8 @@ simde_mm_set1_epi8 (int8_t a) { r_.wasm_v128 = wasm_i8x16_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_b(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -5221,6 +5637,8 @@ simde_mm_set1_epi16 (int16_t a) { r_.wasm_v128 = wasm_i16x8_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_h(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -5249,6 +5667,8 @@ simde_mm_set1_epi32 (int32_t a) { r_.wasm_v128 = wasm_i32x4_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_w(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@
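/* Editor's note (illustrative sketch, not part of the patch): the set_* helpers
 * above spell their arguments most-significant-element first, so the aligned
 * temporaries are written in memory order e0, e1, ... and loaded with a single
 * __lsx_vld. The hypothetical ref_set_epi32() shows the resulting lane layout for
 * _mm_set_epi32(e3, e2, e1, e0). */
#include <stdint.h>
static void ref_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0, int32_t r[4]) {
  r[0] = e0;  /* lowest lane / lowest address */
  r[1] = e1;
  r[2] = e2;
  r[3] = e3;  /* highest lane */
}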
-5277,6 +5697,8 @@ simde_mm_set1_epi64x (int64_t a) { r_.wasm_v128 = wasm_i64x2_splat(a); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplgr2vr_d(a); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -5510,8 +5932,11 @@ simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) return simde__m128i_from_private(r_); } + #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_epi32(a, imm8) (__lsx_vshuf4i_w(simde__m128i_to_private(a).lsx_i64, (imm8))) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ @@ -5566,6 +5991,21 @@ simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shuffle_pd(a, b, imm8) \ + (__extension__ ({ \ + simde__m128d res; \ + if (((imm8) & 0x03) == 0x01) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1001); \ + } else if (((imm8) & 0x03) == 0x02) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1100); \ + } else if (((imm8) & 0x03) == 0x03) { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1101); \ + } else { \ + res = (simde__m128d)__lsx_vshuf4i_d(simde__m128d_to_private(a).lsx_i64, simde__m128d_to_private(b).lsx_i64, 0b1000); \ + } \ + res; \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \ simde__m128d_from_private((simde__m128d_private) { .f64 = \ @@ -5599,6 +6039,9 @@ simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shufflehi_epi16(a, imm8) \ + ((simde__m128i)__lsx_vextrins_d(__lsx_vshuf4i_h(simde__m128i_to_private(a).lsx_i64, (imm8)), simde__m128i_to_private(a).lsx_i64, 0x00)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shufflehi_epi16(a, imm8) \ (__extension__ ({ \ @@ -5659,6 +6102,9 @@ simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_shufflelo_epi16(a, imm8) \ + ((simde__m128i)__lsx_vextrins_d(__lsx_vshuf4i_h(simde__m128i_to_private(a).lsx_i64, (imm8)), simde__m128i_to_private(a).lsx_i64, 0b00010001)) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shufflelo_epi16(a, imm8) \ simde__m128i_from_wasm_v128( \ @@ -5716,6 +6162,8 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { r_.u16 = (a_.u16 << count_.u64[0]); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); #elif
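/* Editor's note (illustrative sketch, not part of the patch): for _mm_shuffle_pd,
 * bit 0 of imm8 picks lane 0 or 1 of `a` for the low result lane and bit 1 picks
 * lane 0 or 1 of `b` for the high result lane; the four vshuf4i_d immediates used
 * above encode exactly those four cases. The hypothetical ref_shuffle_pd() is a
 * scalar reference model of that selection. */
static void ref_shuffle_pd(const double a[2], const double b[2], int imm8, double r[2]) {
  r[0] = a[imm8 & 1];         /* low lane, selected from a */
  r[1] = b[(imm8 >> 1) & 1];  /* high lane, selected from b */
}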
defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0)); #else @@ -5750,6 +6198,8 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { r_.u32 = (a_.u32 << count_.u64[0]); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0)); #else @@ -5785,6 +6235,8 @@ simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, s)) : wasm_i64x2_const(0,0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(HEDLEY_STATIC_CAST(int64_t, s))); #else #if !defined(SIMDE_BUG_GCC_94488) SIMDE_VECTORIZE @@ -5817,6 +6269,8 @@ simde_mm_sqrt_pd (simde__m128d a) { r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_sqrt(a_.altivec_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfsqrt_d(a_.lsx_f64); #elif defined(simde_math_sqrt) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -5848,7 +6302,9 @@ simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(simde_math_sqrt) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfsqrt_d(b_.lsx_f64), 0); + #elif defined(simde_math_sqrt) r_.f64[0] = simde_math_sqrt(b_.f64[0]); r_.f64[1] = a_.f64[1]; #else @@ -5877,6 +6333,8 @@ simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -5908,6 +6366,8 @@ simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -5939,6 +6399,8 @@ simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(cnt)); #else #if !defined(SIMDE_BUG_GCC_94488)
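/* Editor's note (illustrative sketch, not part of the patch): _mm_sll_epi16 and
 * friends take the shift count from the low 64 bits of `count` and produce all
 * zeros once the count reaches the element width; simde handles the out-of-range
 * case before these branches, which is why the LSX lowering only needs to splat
 * the count (vreplgr2vr) and shift. The hypothetical ref_sll_epi16() models the
 * full SSE2 semantics in scalar C. */
#include <stdint.h>
static void ref_sll_epi16(const uint16_t a[8], uint64_t count, uint16_t r[8]) {
  for (int i = 0; i < 8; i++) {
    r[i] = (count > 15) ? 0 : (uint16_t) (a[i] << count);  /* zero when count >= 16 */
  }
}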
SIMDE_VECTORIZE @@ -5970,6 +6432,8 @@ simde_mm_srai_epi16 (simde__m128i a, const int imm8) r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { @@ -6001,6 +6465,8 @@ simde_mm_srai_epi32 (simde__m128i a, const int imm8) r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { @@ -6034,6 +6500,8 @@ simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -6065,6 +6533,8 @@ simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -6119,6 +6589,8 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) #define simde_mm_slli_epi16(a, imm8) \ ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8) @@ -6174,6 +6646,8 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8) } \ ret; \ })) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8) @@ -6214,6 +6688,8 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_slli_epi64(a, imm8) \ ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? 
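/* Editor's note (illustrative sketch, not part of the patch): the arithmetic
 * right shifts clamp the count to (element width - 1) instead of zeroing, so the
 * sign bit is replicated; that clamped `cnt` is what the vreplgr2vr/vsra pairs
 * above receive. The hypothetical ref_sra_epi16() models the SSE2 semantics,
 * assuming the usual arithmetic behaviour of >> on signed values. */
#include <stdint.h>
static void ref_sra_epi16(const int16_t a[8], uint64_t count, int16_t r[8]) {
  unsigned cnt = (count > 15) ? 15 : (unsigned) count;  /* clamp, do not zero */
  for (int i = 0; i < 8; i++) {
    r[i] = (int16_t) (a[i] >> cnt);  /* sign-propagating shift */
  }
}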
simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8) @@ -6257,6 +6733,8 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) #define simde_mm_srli_epi16(a, imm8) \ ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8) @@ -6312,6 +6790,8 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8) } \ ret; \ })) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8) @@ -6356,6 +6836,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_srli_epi64(a, imm8) \ ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) +#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8) @@ -6370,6 +6852,8 @@ simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128d_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); #endif @@ -6388,6 +6872,8 @@ simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vilvl_d(a_.lsx_i64, a_.lsx_i64), mem_addr, 0); #else mem_addr[0] = a_.f64[0]; mem_addr[1] = a_.f64[0]; @@ -6416,6 +6902,8 @@ simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, mem_addr, 0, 0); #else simde_float64 v = a_.f64[0]; simde_memcpy(mem_addr, &v, sizeof(simde_float64)); @@ -6436,6 +6924,8 @@ simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_i64, mem_addr, 0); #else 
simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_)); #endif @@ -6457,6 +6947,8 @@ void *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(a_.lsx_i64, mem_addr, 0, 1); #else *mem_addr = a_.f64[1]; #endif @@ -6471,6 +6963,8 @@ void simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else simde__m128i_private a_ = simde__m128i_to_private(a); int64_t tmp; @@ -6503,6 +6997,8 @@ simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { _mm_storel_pd(mem_addr, a); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), simde__m128d_to_wasm_v128(a), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128d_to_private(a).lsx_f64, mem_addr, 0, 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); @@ -6535,6 +7031,9 @@ simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0); simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp = __lsx_vshuf4i_d(a_.lsx_i64, a_.lsx_i64, 0b0001); + __lsx_vst(temp, mem_addr, 0); #else mem_addr[0] = a_.f64[1]; mem_addr[1] = a_.f64[0]; @@ -6552,6 +7051,8 @@ simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { _mm_storeu_pd(mem_addr, a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128d_to_private(a).lsx_f64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -6565,6 +7066,8 @@ void simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(simde__m128i_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); #endif @@ -6581,6 +7084,8 @@ simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si16(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_h(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int16_t val = simde_x_mm_cvtsi128_si16(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6600,6 +7105,8 @@ simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) { _mm_storeu_si32(mem_addr, a); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store32_lane(mem_addr, simde__m128i_to_wasm_v128(a), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_w(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int32_t val = simde_mm_cvtsi128_si32(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6617,6 +7124,8 @@ simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si64(mem_addr, a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(simde__m128i_to_private(a).lsx_i64, mem_addr, 0, 0); #else int64_t val = simde_mm_cvtsi128_si64(a); simde_memcpy(mem_addr, &val, 
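/* Editor's note (illustrative sketch, not part of the patch): _mm_storer_pd
 * writes the two lanes in reversed order; the vshuf4i_d immediate 0b0001 used
 * above selects {a[1], a[0]} before the plain 16-byte store, while the single-lane
 * stores (storel/storeh/storeu_si*) map onto vstelm of the chosen element. The
 * hypothetical ref_storer_pd() is a scalar reference model of the reversal. */
static void ref_storer_pd(double mem[2], const double a[2]) {
  mem[0] = a[1];  /* high lane goes to the low address */
  mem[1] = a[0];  /* low lane goes to the high address */
}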
sizeof(val)); @@ -6634,7 +7143,7 @@ simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_mm_store_pd(mem_addr, a); @@ -6652,7 +7161,7 @@ simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_mm_store_si128(mem_addr, a); @@ -6671,6 +7180,8 @@ simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { __builtin_nontemporal_store(a, mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_w(__lsx_vreplgr2vr_w(a), mem_addr, 0, 0); #else *mem_addr = a; #endif @@ -6688,6 +7199,8 @@ simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { __builtin_nontemporal_store(a, mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1_s64(mem_addr, vdup_n_s64(a)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_d(__lsx_vreplgr2vr_d(a), mem_addr, 0, 0); #else *mem_addr = a; #endif @@ -6713,6 +7226,8 @@ simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_b(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 - b_.i8; #else @@ -6744,6 +7259,8 @@ simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_h(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 - b_.i16; #else @@ -6775,6 +7292,8 @@ simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_w(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 - b_.i32; #else @@ -6806,6 +7325,8 @@ simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 - b_.i64; #else @@ -6834,6 +7355,8 @@ simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) { r_.u32 = a_.u32 - b_.u32; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 =
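/* Editor's note (illustrative sketch, not part of the patch): _mm_stream_si32 and
 * _mm_stream_si64 are plain scalar stores with a non-temporal hint; splatting the
 * value and storing element 0 with vstelm, as above, has the same observable
 * effect. The hypothetical ref_stream_si32() states that contract. */
#include <stdint.h>
static void ref_stream_si32(int32_t *mem_addr, int32_t a) {
  *mem_addr = a;  /* the non-temporal hint never changes the stored value */
}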
vsubq_u32(a_.neon_u32, b_.neon_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsub_w(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -6861,6 +7384,8 @@ simde_mm_sub_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -6889,10 +7414,12 @@ simde_mm_sub_sd (simde__m128d a, simde__m128d b) { r_, a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] - b_.f64[0]; - r_.f64[1] = a_.f64[1]; - + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64), 0); + #else + r_.f64[0] = a_.f64[0] - b_.f64[0]; + r_.f64[1] = a_.f64[1]; + #endif return simde__m128d_from_private(r_); #endif } @@ -6941,6 +7468,8 @@ simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) { r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_b(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) { @@ -6970,6 +7499,8 @@ simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_h(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { @@ -7001,6 +7532,8 @@ simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_bu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u8[0])) ; i++) { @@ -7032,6 +7565,8 @@ simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) { r_.wasm_v128 = wasm_u16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vssub_hu(a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u16[0])) ; i++) { @@ -7065,6 +7600,8 @@ simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7101,6 +7638,8 @@ simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + 
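/* Editor's note (illustrative sketch, not part of the patch): the saturating
 * subtractions map directly onto __lsx_vssub_*; for the unsigned forms the result
 * clamps at 0 instead of wrapping. The hypothetical ref_subs_epu8_lane() models
 * one lane of _mm_subs_epu8. */
#include <stdint.h>
static uint8_t ref_subs_epu8_lane(uint8_t a, uint8_t b) {
  return (a > b) ? (uint8_t) (a - b) : 0;  /* clamp at 0, never wrap */
}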
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7137,6 +7676,8 @@ simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7173,6 +7714,8 @@ simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7209,6 +7752,8 @@ simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7245,6 +7790,8 @@ simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r = !!__lsx_vpickve2gr_d(__lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -7308,6 +7855,8 @@ simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); #else @@ -7345,6 +7894,8 @@ simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 4, 12, 5, 13, 6, 14, 7, 15); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15); #else @@ -7382,6 +7933,8 @@ simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, 
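/* Editor's note (illustrative sketch, not part of the patch): the vfcmp.c*
 * predicates used above return all-ones (-1) in a lane when the relation holds
 * and 0 otherwise, with the ordered forms (ceq/clt/cle) yielding 0 whenever
 * either input is NaN; !!__lsx_vpickve2gr_d(..., 0) then turns lane 0 into the
 * 0/1 value the ucomi* helpers return. The hypothetical ref_ucomige_sd() models
 * that behaviour, since a NaN operand makes the C comparison false as well. */
static int ref_ucomige_sd(const double a[2], const double b[2]) {
  return (b[0] <= a[0]) ? 1 : 0;  /* NaN in either operand -> 0 */
}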
a_.i32, b_.i32, 2, 6, 3, 7); #else @@ -7416,6 +7969,8 @@ simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vcombine_s64(a_h, b_h); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3); #else @@ -7450,6 +8005,8 @@ simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) { r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(b_.lsx_i64, a_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { @@ -7485,6 +8042,8 @@ simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) { r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_b(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); #else @@ -7522,6 +8081,8 @@ simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 8, 1, 9, 2, 10, 3, 11); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_h(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11); #else @@ -7559,6 +8120,8 @@ simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5); #else @@ -7593,6 +8156,8 @@ simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vcombine_s64(a_l, b_l); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2); #else @@ -7625,6 +8190,8 @@ simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { r_.neon_f64 = vzip1q_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -7659,6 +8226,8 @@ simde_x_mm_negate_pd(simde__m128d a) { r_.neon_f64 = vnegq_f64(a_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + uint64_t u64_ = UINT64_C(0x8000000000000000); + r_.lsx_i64 = __lsx_vxor_v(__lsx_vldrepl_d(&u64_, 0), a_.lsx_i64);
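/* Editor's note (illustrative sketch, not part of the patch): the unpack family
 * interleaves the low (vilvl) or high (vilvh) halves of the two sources, taking
 * the element from `a` first; that is why the LSX intrinsics above are called
 * with the operands in (b, a) order. The hypothetical ref_unpacklo_epi32() shows
 * the resulting layout for _mm_unpacklo_epi32. */
#include <stdint.h>
static void ref_unpacklo_epi32(const int32_t a[4], const int32_t b[4], int32_t r[4]) {
  r[0] = a[0];  r[1] = b[0];  /* interleave the low halves ... */
  r[2] = a[1];  r[3] = b[1];  /* ... element from a first, then from b */
}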
#elif defined(SIMDE_VECTOR_NEGATE) r_.f64 = -a_.f64; #else @@ -7689,6 +8258,8 @@ simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_xor(b_.wasm_v128, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f ^ b_.i32f; #else @@ -7721,6 +8292,8 @@ simde_x_mm_not_si128 (simde__m128i a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f; #else