Skip to content

Commit

Permalink
convi/reducev: fix HWY >= 1.1.0 paths on AVX2/AVX3
Browse files Browse the repository at this point in the history
On AVX2 and AVX3 architectures, for vectors of 32 bytes or larger,
the `InterleaveWhole{Lower,Upper}` operations interleave across the whole
vector, whereas the non-whole variants interleave within each 128-bit block, so the two are not interchangeable.

Ensure we only use `InterleaveWhole*` on RVV/SVE.

This partially reverts commit 4246c06.
  • Loading branch information
kleisauke committed Apr 2, 2024
1 parent 8992822 commit ecc12f3
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 36 deletions.
36 changes: 18 additions & 18 deletions libvips/convolution/convi_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
* - implement using ReorderWidenMulAccumulate
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE
*/

/*
Expand Down Expand Up @@ -71,9 +71,9 @@ constexpr Rebind<uint8_t, DI32> du8x32;
constexpr DI16 di16;
constexpr DI32 di32;

#ifndef HAVE_HWY_1_1_0
#define InterleaveWholeLower InterleaveLower
#define InterleaveWholeUpper InterleaveUpper
/* With Highway >= 1.1.0 on the scalable-vector targets (RVV, SVE), route
 * the InterleaveLower/InterleaveUpper calls in this file to the
 * whole-vector variants. The preprocessor replaces complete identifiers
 * only, so each operation needs its own mapping; a macro named after the
 * shared `Interleave` prefix would never match a call site. All other
 * targets (including AVX2/AVX3) keep the plain per-block operations.
 */
#if defined(HAVE_HWY_1_1_0) && \
	(HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE))
#define InterleaveLower InterleaveWholeLower
#define InterleaveUpper InterleaveWholeUpper
#endif

HWY_ATTR void
Expand Down Expand Up @@ -139,24 +139,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -168,24 +168,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
*/
auto top = LoadU(du8, p + offsets[i]);

auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -266,7 +266,7 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8x16, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveWholeLower(top, bottom);
auto source = InterleaveLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down
36 changes: 18 additions & 18 deletions libvips/resample/reducev_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
* - implement using ReorderWidenMulAccumulate
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE
*/

/*
Expand Down Expand Up @@ -71,9 +71,9 @@ constexpr Rebind<uint8_t, DI32> du8x32;
constexpr DI16 di16;
constexpr DI32 di32;

#ifndef HAVE_HWY_1_1_0
#define InterleaveWholeLower InterleaveLower
#define InterleaveWholeUpper InterleaveUpper
/* With Highway >= 1.1.0 on the scalable-vector targets (RVV, SVE), route
 * the InterleaveLower/InterleaveUpper calls in this file to the
 * whole-vector variants. The preprocessor replaces complete identifiers
 * only, so each operation needs its own mapping; a macro named after the
 * shared `Interleave` prefix would never match a call site. All other
 * targets (including AVX2/AVX3) keep the plain per-block operations.
 */
#if defined(HAVE_HWY_1_1_0) && \
	(HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE))
#define InterleaveLower InterleaveWholeLower
#define InterleaveUpper InterleaveWholeUpper
#endif

HWY_ATTR void
Expand Down Expand Up @@ -133,24 +133,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8, p); /* bottom line */
p += l1;

auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -161,24 +161,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto top = LoadU(du8, p);
p += l1;

auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -254,7 +254,7 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8x16, p); /* bottom line */
p += l1;

auto source = InterleaveWholeLower(top, bottom);
auto source = InterleaveLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down

0 comments on commit ecc12f3

Please sign in to comment.