Skip to content

Commit

Permalink
NEON: properly implement _high intrinsics
Browse files Browse the repository at this point in the history
High intrinsics merely have an implicit vget_high or vcombine. There is no
need to complicate them further.
  • Loading branch information
easyaspi314 authored and mr-c committed Jun 11, 2023
1 parent 11a6182 commit c3aa2c6
Show file tree
Hide file tree
Showing 9 changed files with 68 additions and 324 deletions.
17 changes: 8 additions & 9 deletions simde/arm/neon/addl_high.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@
#if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
#define SIMDE_ARM_NEON_ADDL_HIGH_H

#include "add.h"
#include "movl.h"
#include "movl_high.h"
#include "addl.h"
#include "get_high.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
Expand All @@ -43,7 +42,7 @@ simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s8(a, b);
#else
return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
return simde_vaddl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -57,7 +56,7 @@ simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s16(a, b);
#else
return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
return simde_vaddl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -71,7 +70,7 @@ simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s32(a, b);
#else
return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
return simde_vaddl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -85,7 +84,7 @@ simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u8(a, b);
#else
return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
return simde_vaddl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -99,7 +98,7 @@ simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u16(a, b);
#else
return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
return simde_vaddl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -113,7 +112,7 @@ simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u32(a, b);
#else
return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
return simde_vaddl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand Down
82 changes: 8 additions & 74 deletions simde/arm/neon/addw_high.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
#define SIMDE_ARM_NEON_ADDW_HIGH_H

#include "types.h"
#include "movl_high.h"
#include "add.h"
#include "get_high.h"
#include "addw.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
Expand All @@ -40,19 +40,8 @@ simde_int16x8_t
simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s16(a, simde_vmovl_high_s8(b));
#else
simde_int16x8_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int16x8_from_private(r_);
return simde_vaddw_s8(a, simde_vget_high_s8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -65,19 +54,8 @@ simde_int32x4_t
simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s32(a, simde_vmovl_high_s16(b));
#else
simde_int32x4_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int32x4_from_private(r_);
return simde_vaddw_s16(a, simde_vget_high_s16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -90,19 +68,8 @@ simde_int64x2_t
simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s64(a, simde_vmovl_high_s32(b));
#else
simde_int64x2_private r_;
simde_int64x2_private a_ = simde_int64x2_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int64x2_from_private(r_);
return simde_vaddw_s32(a, simde_vget_high_s32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -115,19 +82,8 @@ simde_uint16x8_t
simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u16(a, simde_vmovl_high_u8(b));
#else
simde_uint16x8_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint16x8_from_private(r_);
return simde_vaddw_u8(a, simde_vget_high_u8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -140,19 +96,8 @@ simde_uint32x4_t
simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u32(a, simde_vmovl_high_u16(b));
#else
simde_uint32x4_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint32x4_from_private(r_);
return simde_vaddw_u16(a, simde_vget_high_u16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -165,19 +110,8 @@ simde_uint64x2_t
simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u64(a, simde_vmovl_high_u32(b));
#else
simde_uint64x2_private r_;
simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint64x2_from_private(r_);
return simde_vaddw_u32(a, simde_vget_high_u32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand Down
46 changes: 8 additions & 38 deletions simde/arm/neon/mlal_high.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H)
#define SIMDE_ARM_NEON_MLAL_HIGH_H

#include "movl_high.h"
#include "mla.h"
#include "get_high.h"
#include "mlal.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
Expand All @@ -42,7 +42,7 @@ simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s8(a, b, c);
#else
return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c));
return simde_vmlal_s8(a, simde_vget_high_s8(b), simde_vget_high_s8(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -56,7 +56,7 @@ simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s16(a, b, c);
#else
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c));
return simde_vmlal_s16(a, simde_vget_high_s16(b), simde_vget_high_s16(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -70,22 +70,7 @@ simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s32(a, b, c);
#else
simde_int64x2_private
r_,
a_ = simde_int64x2_to_private(a),
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
c_ = simde_int64x2_to_private(simde_vmovl_high_s32(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_int64x2_from_private(r_);
return simde_vmlal_s32(a, simde_vget_high_s32(b), simde_vget_high_s32(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -99,7 +84,7 @@ simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u8(a, b, c);
#else
return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c));
return simde_vmlal_u8(a, simde_vget_high_u8(b), simde_vget_high_u8(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -113,7 +98,7 @@ simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u16(a, b, c);
#else
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c));
return simde_vmlal_u16(a, simde_vget_high_u16(b), simde_vget_high_u16(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -127,22 +112,7 @@ simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u32(a, b, c);
#else
simde_uint64x2_private
r_,
a_ = simde_uint64x2_to_private(a),
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
c_ = simde_uint64x2_to_private(simde_vmovl_high_u32(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_uint64x2_from_private(r_);
return simde_vmlal_u32(a, simde_vget_high_u32(b), simde_vget_high_u32(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand Down
43 changes: 6 additions & 37 deletions simde/arm/neon/mlal_high_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H)
#define SIMDE_ARM_NEON_MLAL_HIGH_N_H

#include "movl_high.h"
#include "dup_n.h"
#include "mla.h"
#include "get_high.h"
#include "mlal_n.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
Expand All @@ -42,7 +41,7 @@ simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_s16(a, b, c);
#else
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c));
return simde_vmlal_n_s16(a, simde_vget_high_s16(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -56,22 +55,7 @@ simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_s32(a, b, c);
#else
simde_int64x2_private
r_,
a_ = simde_int64x2_to_private(a),
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_int64x2_from_private(r_);
return simde_vmlal_n_s32(a, simde_vget_high_s32(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -85,7 +69,7 @@ simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_u16(a, b, c);
#else
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c));
return simde_vmlal_n_u16(a, simde_vget_high_u16(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand All @@ -99,22 +83,7 @@ simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_u32(a, b, c);
#else
simde_uint64x2_private
r_,
a_ = simde_uint64x2_to_private(a),
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
c_ = simde_uint64x2_to_private(simde_vdupq_n_u64(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_uint64x2_from_private(r_);
return simde_vmlal_n_u32(a, simde_vget_high_u32(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
Expand Down
Loading

0 comments on commit c3aa2c6

Please sign in to comment.