From ecc12f3dd580ef8c05094eaac1882d936b7a3de2 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Tue, 2 Apr 2024 17:06:30 +0200 Subject: [PATCH] convi/reducev: fix HWY >= 1.1.0 paths on AVX2/AVX3 On AVX2 and AVX3 architectures, for vectors of 32 bytes or larger, the semantics of the `InterleaveWhole{Lower,Upper}` operations do not align with those of the non-whole variants. Ensure we only use `InterleaveWhole*` on RVV/SVE, by aliasing `Interleave{Lower,Upper}` to `InterleaveWhole{Lower,Upper}` on those targets. This partially reverts commit 4246c062f290f1c02b54dfd769b37254b4a6f843. --- libvips/convolution/convi_hwy.cpp | 36 +++++++++++++++---------------- libvips/resample/reducev_hwy.cpp | 36 +++++++++++++++---------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/libvips/convolution/convi_hwy.cpp b/libvips/convolution/convi_hwy.cpp index ef9318f929..117e7dcad0 100644 --- a/libvips/convolution/convi_hwy.cpp +++ b/libvips/convolution/convi_hwy.cpp @@ -4,8 +4,8 @@ * - implement using ReorderWidenMulAccumulate * 29/11/22 kleisauke * - prefer use of RearrangeToOddPlusEven - * 02/10/23 kleisauke - * - prefer use of InterleaveWhole{Lower,Upper} + * 02/10/23 kleisauke + * - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE */ /* @@ -71,9 +71,9 @@ constexpr Rebind du8x32; constexpr DI16 di16; constexpr DI32 di32; -#ifndef HAVE_HWY_1_1_0 -#define InterleaveWholeLower InterleaveLower -#define InterleaveWholeUpper InterleaveUpper +#if defined(HAVE_HWY_1_1_0) && (HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE)) +#define InterleaveLower InterleaveWholeLower /* RVV/SVE: use the whole-vector variant */ +#define InterleaveUpper InterleaveWholeUpper /* RVV/SVE: use the whole-vector variant */ #endif HWY_ATTR void @@ -139,24 +139,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r, auto bottom = LoadU(du8, /* bottom line */ p + offsets[i + 1]); - auto source = InterleaveWholeLower(top, bottom); - auto pix = BitCast(di16, InterleaveWholeLower(source, zero)); + auto source = InterleaveLower(top, bottom); + auto pix = BitCast(di16, InterleaveLower(source, zero)); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0, /* byref */ sum1); -
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2, /* byref */ sum3); - source = InterleaveWholeUpper(du8, top, bottom); - pix = BitCast(di16, InterleaveWholeLower(source, zero)); + source = InterleaveUpper(du8, top, bottom); + pix = BitCast(di16, InterleaveLower(source, zero)); sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4, /* byref */ sum5); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6, /* byref */ sum7); @@ -168,24 +168,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r, */ auto top = LoadU(du8, p + offsets[i]); - auto source = InterleaveWholeLower(top, zero); - auto pix = BitCast(di16, InterleaveWholeLower(source, zero)); + auto source = InterleaveLower(top, zero); + auto pix = BitCast(di16, InterleaveLower(source, zero)); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0, /* byref */ sum1); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2, /* byref */ sum3); - source = InterleaveWholeUpper(du8, top, zero); - pix = BitCast(di16, InterleaveWholeLower(source, zero)); + source = InterleaveUpper(du8, top, zero); + pix = BitCast(di16, InterleaveLower(source, zero)); sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4, /* byref */ sum5); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6, /* byref */ sum7); @@ -266,7 +266,7 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r, auto bottom = LoadU(du8x16, /* bottom line */ p + offsets[i + 1]); - auto source = InterleaveWholeLower(top, bottom); + auto
source = InterleaveLower(top, bottom); auto pix = PromoteTo(di16, source); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0, diff --git a/libvips/resample/reducev_hwy.cpp b/libvips/resample/reducev_hwy.cpp index c3bc9ecf8f..7ee9231a70 100644 --- a/libvips/resample/reducev_hwy.cpp +++ b/libvips/resample/reducev_hwy.cpp @@ -4,8 +4,8 @@ * - implement using ReorderWidenMulAccumulate * 29/11/22 kleisauke * - prefer use of RearrangeToOddPlusEven - * 02/10/23 kleisauke - * - prefer use of InterleaveWhole{Lower,Upper} + * 02/10/23 kleisauke + * - prefer use of InterleaveWhole{Lower,Upper} on RVV/SVE */ /* @@ -71,9 +71,9 @@ constexpr Rebind du8x32; constexpr DI16 di16; constexpr DI32 di32; -#ifndef HAVE_HWY_1_1_0 -#define InterleaveWholeLower InterleaveLower -#define InterleaveWholeUpper InterleaveUpper +#if defined(HAVE_HWY_1_1_0) && (HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE)) +#define InterleaveLower InterleaveWholeLower /* RVV/SVE: use the whole-vector variant */ +#define InterleaveUpper InterleaveWholeUpper /* RVV/SVE: use the whole-vector variant */ #endif HWY_ATTR void @@ -133,24 +133,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin, auto bottom = LoadU(du8, p); /* bottom line */ p += l1; - auto source = InterleaveWholeLower(top, bottom); - auto pix = BitCast(di16, InterleaveWholeLower(source, zero)); + auto source = InterleaveLower(top, bottom); + auto pix = BitCast(di16, InterleaveLower(source, zero)); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0, /* byref */ sum1); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2, /* byref */ sum3); - source = InterleaveWholeUpper(du8, top, bottom); - pix = BitCast(di16, InterleaveWholeLower(source, zero)); + source = InterleaveUpper(du8, top, bottom); + pix = BitCast(di16, InterleaveLower(source, zero)); sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4, /* byref */ sum5); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum6 =
ReorderWidenMulAccumulate(di32, pix, mmk, sum6, /* byref */ sum7); @@ -161,24 +161,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin, auto top = LoadU(du8, p); p += l1; - auto source = InterleaveWholeLower(top, zero); - auto pix = BitCast(di16, InterleaveWholeLower(source, zero)); + auto source = InterleaveLower(top, zero); + auto pix = BitCast(di16, InterleaveLower(source, zero)); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0, /* byref */ sum1); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2, /* byref */ sum3); - source = InterleaveWholeUpper(du8, top, zero); - pix = BitCast(di16, InterleaveWholeLower(source, zero)); + source = InterleaveUpper(du8, top, zero); + pix = BitCast(di16, InterleaveLower(source, zero)); sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4, /* byref */ sum5); - pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero)); + pix = BitCast(di16, InterleaveUpper(du8, source, zero)); sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6, /* byref */ sum7); @@ -254,7 +254,7 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin, auto bottom = LoadU(du8x16, p); /* bottom line */ p += l1; - auto source = InterleaveWholeLower(top, bottom); + auto source = InterleaveLower(top, bottom); auto pix = PromoteTo(di16, source); sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,