Skip to content

Commit

Permalink
[WiP] Prefer use of InterleaveWhole{Lower,Upper}
Browse files Browse the repository at this point in the history
Depends on commit:
google/highway@bad65ea
  • Loading branch information
kleisauke committed Oct 2, 2023
1 parent 3b48a7c commit d7cfe96
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 43 deletions.
35 changes: 15 additions & 20 deletions libvips/convolution/convi_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* - implement using ReorderWidenMulAccumulate
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
*/

/*
Expand Down Expand Up @@ -77,14 +79,7 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
#if HWY_TARGET != HWY_SCALAR
int32_t bo = VIPS_RECT_BOTTOM(r);

#if HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE)
/* Ensure we do not cross 128-bit block boundaries on RVV/SVE.
*/
const int32_t N = 16;
#else
const int32_t N = Lanes(du8);
#endif

const auto zero = Zero(du8);
const auto v_exp = Set(di32, 1 << (exp - 1));
const auto v_offset = Set(di32, offset);
Expand Down Expand Up @@ -131,24 +126,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));
auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));
source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -160,24 +155,24 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
*/
auto top = LoadU(du8, p + offsets[i]);

auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));
auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));
source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -258,7 +253,7 @@ vips_convi_uchar_hwy(VipsRegion *out_region, VipsRegion *ir, VipsRect *r,
auto bottom = LoadU(du8x16, /* bottom line */
p + offsets[i + 1]);

auto source = InterleaveLower(top, bottom);
auto source = InterleaveWholeLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down
35 changes: 15 additions & 20 deletions libvips/resample/reducev_hwy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* - implement using ReorderWidenMulAccumulate
* 29/11/22 kleisauke
* - prefer use of RearrangeToOddPlusEven
* 02/10/23 kleisauke
* - prefer use of InterleaveWhole{Lower,Upper}
*/

/*
Expand Down Expand Up @@ -76,14 +78,7 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
#if HWY_TARGET != HWY_SCALAR
const auto l1 = lskip / sizeof(uint8_t);

#if HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && HWY_TARGET <= HWY_SVE)
/* Ensure we do not cross 128-bit block boundaries on RVV/SVE.
*/
const int32_t N = 16;
#else
const int32_t N = Lanes(du8);
#endif

const auto zero = Zero(du8);
const auto initial = Set(di32, VIPS_INTERPOLATE_SCALE >> 1);

Expand Down Expand Up @@ -125,24 +120,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8, p); /* bottom line */
p += l1;

auto source = InterleaveLower(top, bottom);
auto pix = BitCast(di16, InterleaveLower(source, zero));
auto source = InterleaveWholeLower(top, bottom);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveLower(source, zero));
source = InterleaveWholeUpper(du8, top, bottom);
pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand All @@ -153,24 +148,24 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto top = LoadU(du8, p);
p += l1;

auto source = InterleaveLower(top, zero);
auto pix = BitCast(di16, InterleaveLower(source, zero));
auto source = InterleaveWholeLower(top, zero);
auto pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
/* byref */ sum1);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum2 = ReorderWidenMulAccumulate(di32, pix, mmk, sum2,
/* byref */ sum3);

source = InterleaveUpper(du8, top, zero);
pix = BitCast(di16, InterleaveLower(source, zero));
source = InterleaveWholeUpper(du8, top, zero);
pix = BitCast(di16, InterleaveWholeLower(source, zero));

sum4 = ReorderWidenMulAccumulate(di32, pix, mmk, sum4,
/* byref */ sum5);

pix = BitCast(di16, InterleaveUpper(du8, source, zero));
pix = BitCast(di16, InterleaveWholeUpper(du8, source, zero));

sum6 = ReorderWidenMulAccumulate(di32, pix, mmk, sum6,
/* byref */ sum7);
Expand Down Expand Up @@ -246,7 +241,7 @@ vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin,
auto bottom = LoadU(du8x16, p); /* bottom line */
p += l1;

auto source = InterleaveLower(top, bottom);
auto source = InterleaveWholeLower(top, bottom);
auto pix = PromoteTo(di16, source);

sum0 = ReorderWidenMulAccumulate(di32, pix, mmk, sum0,
Expand Down
6 changes: 3 additions & 3 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -435,9 +435,9 @@ if libopenjp2_dep.found()
cfg_var.set('HAVE_LIBOPENJP2', '1')
endif

# Require 1.0.5 to support the `ReorderDemote2To(u8, i16, i16)` operation
# See: https://github.com/google/highway/pull/1247
libhwy_dep = dependency('libhwy', version: '>=1.0.5', required: get_option('highway'))
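# TODO(kleisauke): InterleaveWhole{Lower,Upper} depends on a Highway commit that is not yet in a release; bump this minimum version once a new upstream release is available - see: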
# https://github.com/google/highway/pull/1766
libhwy_dep = dependency('libhwy', version: '>=1.0.7', required: get_option('highway'))
if libhwy_dep.found()
libvips_deps += libhwy_dep
cfg_var.set('HAVE_HWY', '1')
Expand Down

0 comments on commit d7cfe96

Please sign in to comment.