x86 sse2/avx2: rewrite loongarch immediate-operand shift instructions
x86 intrinsics like _mm_srli_epi16() can accept either a variable or an
immediate operand as the second parameter; however, the corresponding
LoongArch intrinsics like __lsx_vsrli_h() accept only an immediate
operand as the second parameter, so we need to rewrite them to
avoid compilation errors.
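
Below is a minimal sketch of the issue, assuming a LoongArch toolchain with LSX enabled and the standard <lsxintrin.h> header; the function names are illustrative only. The immediate-form intrinsic __lsx_vsrli_h() rejects a runtime shift count, while broadcasting the count with __lsx_vreplgr2vr_h() and using the variable-count form __lsx_vsrl_h() compiles for any value, which is the pattern this commit adopts.

    #include <lsxintrin.h>

    /* Does NOT compile: __lsx_vsrli_h() requires a compile-time immediate
     * shift count, but 'count' is only known at run time.
     *
     * __m128i shift_imm_form(__m128i v, int count) {
     *   return __lsx_vsrli_h(v, count);
     * }
     */

    /* Compiles: broadcast the runtime count into every 16-bit lane and use
     * the variable-count shift instead. */
    __m128i shift_var_form(__m128i v, int count) {
      return __lsx_vsrl_h(v, __lsx_vreplgr2vr_h(count));
    }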
HecaiYuan committed Dec 12, 2024
1 parent 6daf535 commit 3afebc4
Showing 2 changed files with 54 additions and 38 deletions.
54 changes: 35 additions & 19 deletions simde/x86/avx2.h
@@ -4564,6 +4564,10 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
the expected range, hence the discrepancy between what we allow and what
Intel specifies. Some compilers will return 0, others seem to just mask
off everything outside of the range. */
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4586,8 +4590,6 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvslli_h(a, imm8 & 15))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -4603,6 +4605,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4625,8 +4631,6 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi32(a, imm8) (imm8 > 31 ? __lasx_xvreplgr2vr_w(0) : __lasx_xvslli_w(a, imm8 & 31))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -4642,6 +4646,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4659,8 +4667,6 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi64(a, imm8) (imm8 > 63 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvslli_d(a, imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi64(a, imm8) \
simde_mm256_set_m128i( \
@@ -4927,6 +4933,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h((imm8 > 15 ? 15 : imm8)));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4947,8 +4957,6 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srai_epi16(a, imm8) __lasx_xvsrai_h(a, (imm8 > 15 ? 15 : imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srai_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -4964,6 +4972,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w((imm8 > 31 ? 31 : imm8)));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4984,8 +4996,6 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srai_epi32(a, imm8) __lasx_xvsrai_w(a, (imm8 > 31 ? 31 : imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srai_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -5183,13 +5193,17 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
if (imm8 > 15)
return simde_mm256_setzero_si256();

#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);

if (imm8 > 15)
return simde_mm256_setzero_si256();

#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
@@ -5214,8 +5228,6 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvsrli_h(a, imm8 & 15))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -5231,6 +5243,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -5253,8 +5269,6 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi32(a, imm8) __lasx_xvsrli_w(a, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -5270,6 +5284,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -5287,8 +5305,6 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi64(a, imm8) __lasx_xvsrli_d(a, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi64(a, imm8) \
simde_mm256_set_m128i( \
38 changes: 19 additions & 19 deletions simde/x86/sse2.h
@@ -6163,7 +6163,7 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vslli_h(a_.lsx_i64, count_.u64[0]);
r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(count_.u64[0]));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
#else
@@ -6199,7 +6199,7 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vslli_w(a_.lsx_i64, count_.u64[0]);
r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(count_.u64[0]));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
#else
@@ -6561,7 +6561,7 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
#else
const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
Expand Down Expand Up @@ -6589,8 +6591,6 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
#define simde_mm_slli_epi16(a, imm8) \
((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
@@ -6607,7 +6607,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i32 = a_.i32 << imm8;
#else
SIMDE_VECTORIZE
@@ -6646,8 +6648,6 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
} \
ret; \
}))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
@@ -6664,7 +6664,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i64 = a_.i64 << imm8;
#else
SIMDE_VECTORIZE
@@ -6688,8 +6690,6 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
#define simde_mm_slli_epi64(a, imm8) \
((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
@@ -6706,7 +6706,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
#else
SIMDE_VECTORIZE
@@ -6733,8 +6735,6 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
#define simde_mm_srli_epi16(a, imm8) \
((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
@@ -6751,7 +6751,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
#else
SIMDE_VECTORIZE
@@ -6790,8 +6792,6 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
} \
ret; \
}))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
@@ -6810,6 +6810,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
#else
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
@@ -6836,8 +6838,6 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
#define simde_mm_srli_epi64(a, imm8) \
((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
