Skip to content

Commit

Permalink
convi/reducev: fix HWY >= 1.1.0 paths on AVX2/AVX3 (libvips#3920)
Browse files Browse the repository at this point in the history
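On AVX2 and AVX3 architectures, for vectors of 32 bytes or larger,
the `InterleaveWhole{Lower,Upper}` operations interleave lanes across
the whole vector, whereas the non-whole `Interleave{Lower,Upper}`
variants operate independently on each 128-bit block, so the two are
not interchangeable there.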

Ensure we only use `InterleaveWhole*` on RVV/SVE.

This partially reverts commit 9924904.
  • Loading branch information
kleisauke authored Apr 6, 2024
1 parent 3ae2933 commit 51a3958
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 34 deletions.
38 changes: 21 additions & 17 deletions libvips/convolution/convi_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
* - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE
*/

/*
Expand Down Expand Up @@ -71,9 +71,10 @@ constexpr Rebind<uint8_t, DI32> du8x32;
constexpr DI16 di16;
constexpr DI32 di32;

#ifndef HAVE_HWY_1_1_0
#define InterleaveWholeLower InterleaveLower
#define InterleaveWholeUpper InterleaveUpper
/* Alias Interleave{Lower,Upper} to the InterleaveWhole* variants, but only
 * when HAVE_HWY_1_1_0 is defined and we are compiling for RVV or for an
 * Arm A64 target satisfying HWY_TARGET <= HWY_SVE. Every other target keeps
 * the plain Interleave{Lower,Upper} ops: on AVX2/AVX3, for vectors of 32
 * bytes or larger, the whole variants do not have the same semantics.
 * Both aliases are #undef'd again after the function body.
 */
#if defined(HAVE_HWY_1_1_0) && \
(HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE))
#define InterleaveLower InterleaveWholeLower
#define InterleaveUpper InterleaveWholeUpper
#endif

HWY_ATTR void
Expand Down Expand Up @@ -139,24 +140,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -168,24 +169,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
*/
auto top = LoadU(du8, p + offsets[i]);

auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -266,7 +267,7 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8x16, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveWholeLower(top, bottom);
auto source = InterleaveLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down Expand Up @@ -305,6 +306,9 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
#endif
}

#undef InterleaveLower
#undef InterleaveUpper

} /*namespace HWY_NAMESPACE*/

#if HWY_ONCE
Expand Down
38 changes: 21 additions & 17 deletions libvips/resample/reducev_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
* - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE
*/

/*
Expand Down Expand Up @@ -71,9 +71,10 @@ constexpr Rebind<uint8_t, DI32> du8x32;
constexpr DI16 di16;
constexpr DI32 di32;

#ifndef HAVE_HWY_1_1_0
#define InterleaveWholeLower InterleaveLower
#define InterleaveWholeUpper InterleaveUpper
/* Alias Interleave{Lower,Upper} to the InterleaveWhole* variants, but only
 * when HAVE_HWY_1_1_0 is defined and we are compiling for RVV or for an
 * Arm A64 target satisfying HWY_TARGET <= HWY_SVE. Every other target keeps
 * the plain Interleave{Lower,Upper} ops: on AVX2/AVX3, for vectors of 32
 * bytes or larger, the whole variants do not have the same semantics.
 * Both aliases are #undef'd again after the function body.
 */
#if defined(HAVE_HWY_1_1_0) && \
(HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE))
#define InterleaveLower InterleaveWholeLower
#define InterleaveUpper InterleaveWholeUpper
#endif

HWY_ATTR void
Expand Down Expand Up @@ -133,24 +134,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8, p); /* bottom line */
p += l1;

auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -161,24 +162,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto top = LoadU(du8, p);
p += l1;

auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));
auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));
source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));
pix = BitCast(di16, InterleaveUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -254,7 +255,7 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8x16, p); /* bottom line */
p += l1;

auto source = InterleaveWholeLower(top, bottom);
auto source = InterleaveLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down Expand Up @@ -289,6 +290,9 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
#endif
}

#undef InterleaveLower
#undef InterleaveUpper

} /*namespace HWY_NAMESPACE*/

#if HWY_ONCE
Expand Down

0 comments on commit 51a3958

Please sign in to comment.