diff --git a/simde/x86/avx2.h b/simde/x86/avx2.h
index ad939e90a..abc5ecce2 100644
--- a/simde/x86/avx2.h
+++ b/simde/x86/avx2.h
@@ -4564,6 +4564,10 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
      the expected range, hence the discrepancy between what we allow and what
      Intel specifies. Some compilers will return 0, others seem to just mask
      off everything outside of the range. */
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4586,8 +4590,6 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvslli_h(a, imm8 & 15))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi16(a, imm8) \
      simde_mm256_set_m128i( \
@@ -4603,6 +4605,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4625,8 +4631,6 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi32(a, imm8) (imm8 > 31 ? __lasx_xvreplgr2vr_w(0) : __lasx_xvslli_w(a, imm8 & 31))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi32(a, imm8) \
      simde_mm256_set_m128i( \
@@ -4642,6 +4646,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4659,8 +4667,6 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi64(a, imm8) (imm8 > 63 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvslli_d(a, imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi64(a, imm8) \
      simde_mm256_set_m128i( \
@@ -4927,6 +4933,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h((imm8 > 15 ? 15 : imm8)));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4947,8 +4957,6 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srai_epi16(a, imm8) __lasx_xvsrai_h(a, (imm8 > 15 ? 15 : imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srai_epi16(a, imm8) \
      simde_mm256_set_m128i( \
@@ -4964,6 +4972,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w((imm8 > 31 ? 31 : imm8)));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4984,8 +4996,6 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srai_epi32(a, imm8) __lasx_xvsrai_w(a, (imm8 > 31 ? 31 : imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srai_epi32(a, imm8) \
      simde_mm256_set_m128i( \
@@ -5183,13 +5193,17 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  if (imm8 > 15)
+    return simde_mm256_setzero_si256();
+
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
 
-  if (imm8 > 15)
-    return simde_mm256_setzero_si256();
-
   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
     SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
     for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
@@ -5214,8 +5228,6 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvsrli_h(a, imm8 & 15))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi16(a, imm8) \
      simde_mm256_set_m128i( \
@@ -5231,6 +5243,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -5253,8 +5269,6 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi32(a, imm8) __lasx_xvsrli_w(a, imm8)
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi32(a, imm8) \
      simde_mm256_set_m128i( \
@@ -5270,6 +5284,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -5287,8 +5305,6 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi64(a, imm8) __lasx_xvsrli_d(a, imm8)
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi64(a, imm8) \
      simde_mm256_set_m128i( \
diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h
index ee50bca3b..feffb0244 100644
--- a/simde/x86/sse2.h
+++ b/simde/x86/sse2.h
@@ -6163,7 +6163,7 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
   #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-    r_.lsx_i64 = __lsx_vslli_h(a_.lsx_i64, count_.u64[0]);
+    r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(count_.u64[0]));
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
   #else
@@ -6199,7 +6199,7 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
   #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
   #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-    r_.lsx_i64 = __lsx_vslli_w(a_.lsx_i64, count_.u64[0]);
+    r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(count_.u64[0]));
   #elif defined(SIMDE_WASM_SIMD128_NATIVE)
     r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
   #else
@@ -6561,7 +6561,9 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
   #else
     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
@@ -6589,8 +6591,6 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
   #define simde_mm_slli_epi16(a, imm8) \
     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
@@ -6607,7 +6607,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i32 = a_.i32 << imm8;
   #else
     SIMDE_VECTORIZE
@@ -6646,8 +6648,6 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
         } \
         ret; \
     }))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
@@ -6664,7 +6664,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i64 = a_.i64 << imm8;
   #else
     SIMDE_VECTORIZE
@@ -6688,8 +6690,6 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
   #define simde_mm_slli_epi64(a, imm8) \
     ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
@@ -6706,7 +6706,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
   #else
     SIMDE_VECTORIZE
@@ -6733,8 +6735,6 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
   #define simde_mm_srli_epi16(a, imm8) \
     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
@@ -6751,7 +6751,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
   #else
     SIMDE_VECTORIZE
@@ -6790,8 +6792,6 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
         } \
         ret; \
     }))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
@@ -6810,6 +6810,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
 
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
   #else
     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
       r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
@@ -6836,8 +6838,6 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
   #define simde_mm_srli_epi64(a, imm8) \
     ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)