diff --git a/libvips/resample/meson.build b/libvips/resample/meson.build index 1e75edd75..22095df62 100644 --- a/libvips/resample/meson.build +++ b/libvips/resample/meson.build @@ -8,6 +8,7 @@ resample_sources = files( 'resize.c', 'shrink.c', 'shrinkh.c', + 'shrinkh_hwy.cpp', 'shrinkv.c', 'shrinkv_hwy.cpp', 'reduce.c', diff --git a/libvips/resample/presample.h b/libvips/resample/presample.h index 157bfbc75..56c436e9f 100644 --- a/libvips/resample/presample.h +++ b/libvips/resample/presample.h @@ -78,6 +78,8 @@ void vips_reduceh_uchar_hwy(VipsPel *pout, VipsPel *pin, void vips_reducev_uchar_hwy(VipsPel *pout, VipsPel *pin, int n, int ne, int lskip, const short *restrict k); +void vips_shrinkh_uchar_hwy(VipsPel *pout, VipsPel *pin, + int width, int hshrink, int bands); void vips_shrinkv_uchar_hwy(VipsPel *pout, VipsPel *pin, int ne, int vshrink, int lskip); diff --git a/libvips/resample/shrinkh.c b/libvips/resample/shrinkh.c index 574a44cf5..4ff9706b4 100644 --- a/libvips/resample/shrinkh.c +++ b/libvips/resample/shrinkh.c @@ -55,6 +55,7 @@ #include #include +#include #include #include @@ -262,6 +263,73 @@ vips_shrinkh_gen(VipsRegion *out_region, return 0; } +#ifdef HAVE_HWY +static int +vips_shrinkh_uchar_vector_gen(VipsRegion *out_region, + void *seq, void *a, void *b, gboolean *stop) +{ + /* Generate callback for the uchar vector path: every output scanline is produced by one vips_shrinkh_uchar_hwy() call; returns 0 on success, -1 if preparing the input region fails. How do we chunk up the image? We don't want to prepare the whole of + * the input region corresponding to *r since it could be huge. + * + * Reading a line at a time could cause a lot of overcomputation, depending + * on what's upstream from us. In SMALLTILE, output scanlines could be + * quite small. + * + * Use fatstrip height as a compromise. 
+ */ + const int dy = vips__fatstrip_height; + + VipsImage *in = (VipsImage *) a; + VipsShrinkh *shrink = (VipsShrinkh *) b; + VipsRegion *ir = (VipsRegion *) seq; + VipsRect *r = &out_region->valid; + const int bands = in->Bands; + + int y, y1; + +#ifdef DEBUG + printf("vips_shrinkh_uchar_vector_gen: generating %d x %d at %d x %d\n", + r->width, r->height, r->left, r->top); +#endif /*DEBUG*/ + + for (y = 0; y < r->height; y += dy) { + int chunk_height = VIPS_MIN(dy, r->height - y); + + VipsRect s; + /* Input rect for this chunk: the same rows as the output chunk, with left and width scaled up by hshrink. */ + s.left = r->left * shrink->hshrink; + s.top = r->top + y; + s.width = r->width * shrink->hshrink; + s.height = chunk_height; +#ifdef DEBUG + printf("vips_shrinkh_uchar_vector_gen: requesting %d lines from %d\n", + s.height, s.top); +#endif /*DEBUG*/ + if (vips_region_prepare(ir, &s)) + return -1; + + VIPS_GATE_START("vips_shrinkh_uchar_vector_gen: work"); + + // each output line + for (y1 = 0; y1 < chunk_height; y1++) { + // top of this line in the output + int top = r->top + y + y1; + + VipsPel *q = VIPS_REGION_ADDR(out_region, r->left, top); + VipsPel *p = VIPS_REGION_ADDR(ir, s.left, top); + /* NOTE(review): only width, hshrink and bands are passed on; confirm the kernel's vector loads and stores cannot run past the end of these region lines. */ + vips_shrinkh_uchar_hwy(q, p, r->width, shrink->hshrink, bands); + } + + VIPS_GATE_STOP("vips_shrinkh_uchar_vector_gen: work"); + } + + VIPS_COUNT_PIXELS(out_region, "vips_shrinkh_uchar_vector_gen"); + + return 0; +} +#endif /*HAVE_HWY*/ + static int vips_shrinkh_build(VipsObject *object) { @@ -272,6 +340,7 @@ vips_shrinkh_build(VipsObject *object) vips_object_local_array(object, 2); VipsImage *in; + VipsGenerateFn generate; if (VIPS_OBJECT_CLASS(vips_shrinkh_parent_class)->build(object)) return -1; @@ -298,6 +367,20 @@ vips_shrinkh_build(VipsObject *object) return -1; in = t[1]; + /* For uchar input, try to make a vector path. NOTE(review): the test below does not look at in->Bands -- confirm vips_shrinkh_uchar_hwy() copes with every band count. + */ +#ifdef HAVE_HWY + if (in->BandFmt == VIPS_FORMAT_UCHAR && + vips_vector_isenabled()) { + generate = vips_shrinkh_uchar_vector_gen; + g_info("shrinkh: using vector path"); + } + else +#endif /*HAVE_HWY*/ + /* Default to the C path. 
+ */ + generate = vips_shrinkh_gen; + if (vips_image_pipelinev(resample->out, VIPS_DEMAND_STYLE_THINSTRIP, in, NULL)) return -1; @@ -324,7 +407,7 @@ vips_shrinkh_build(VipsObject *object) #endif /*DEBUG*/ if (vips_image_generate(resample->out, - vips_start_one, vips_shrinkh_gen, vips_stop_one, + vips_start_one, generate, vips_stop_one, in, shrink)) return -1; diff --git a/libvips/resample/shrinkh_hwy.cpp b/libvips/resample/shrinkh_hwy.cpp new file mode 100644 index 000000000..41df4b132 --- /dev/null +++ b/libvips/resample/shrinkh_hwy.cpp @@ -0,0 +1,129 @@ +/* 15/11/24 kleisauke + * - from shrinkv_hwy.cpp + */ + +/* + + This file is part of VIPS. + + VIPS is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301 USA + + */ + +/* + + These files are distributed with VIPS - http://www.vips.ecs.soton.ac.uk + + */ + +#ifdef HAVE_CONFIG_H +#include +#endif /*HAVE_CONFIG_H*/ +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "presample.h" + +#ifdef HAVE_HWY + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "libvips/resample/shrinkh_hwy.cpp" +#include +#include + +namespace HWY_NAMESPACE { + +using namespace hwy::HWY_NAMESPACE; + +using DU32 = ScalableTag; +constexpr Rebind du8x32; +constexpr DU32 du32; + +constexpr int64_t max_uint32 = 1LL << 32; +constexpr int32_t max_bits = 1 << 8; + /* Shrink one scanline: writes `width` output pixels to pout, each roughly the rounded mean of `hshrink` consecutive input pixels read from pin; pixels are `bands` interleaved uchar samples. The body is compiled out when HWY_TARGET is HWY_SCALAR. */ +HWY_ATTR void +vips_shrinkh_uchar_hwy(VipsPel *pout, VipsPel *pin, + int32_t width, int32_t hshrink, int32_t bands) +{ +#if HWY_TARGET != HWY_SCALAR + const auto multiplier = Set(du32, max_uint32 / (max_bits * hshrink)); + const auto amend = Set(du32, hshrink / 2); + /* multiplier is 2^32 / (256 * hshrink), i.e. about 2^24 / hshrink; amend is the hshrink / 2 rounding term that every sum starts from. */ + int32_t ix = 0; + + for (int32_t x = 0; x < width; ++x) { + auto *HWY_RESTRICT p = (uint8_t *) pin + ix * bands; + auto *HWY_RESTRICT q = (uint8_t *) pout + x * bands; + /* NOTE(review): LoadU/StoreU below are given no lane count, so presumably each one touches a full du8x32 vector rather than `bands` bytes -- confirm that reading/writing past the current pixel is safe at the end of a line, and that bands never exceeds the lane count. */ + auto sum0 = amend; + /* Sum the hshrink input pixels two per iteration, then pick up the odd one, if any. */ + int32_t xx = 0; + for (; xx + 2 <= hshrink; xx += 2) { + auto pix0 = PromoteTo(du32, LoadU(du8x32, p)); + p += bands; + auto pix1 = PromoteTo(du32, LoadU(du8x32, p)); + p += bands; + + pix0 = Add(pix0, pix1); + sum0 = Add(sum0, pix0); + } + for (; xx < hshrink; ++xx) { + auto pix0 = PromoteTo(du32, LoadU(du8x32, p)); + p += bands; + + sum0 = Add(sum0, pix0); + } + + sum0 = Mul(sum0, multiplier); + + /* The final 32->8 conversion: after the multiply, bits 24 and up hold roughly sum / hshrink. 
+ */ + sum0 = ShiftRight<24>(sum0); + + auto demoted = DemoteTo(du8x32, sum0); + StoreU(demoted, du8x32, q); + /* Step ix to the first input pixel of the next output pixel. */ + ix += hshrink; + } +#endif +} + +} /*namespace HWY_NAMESPACE*/ + +#if HWY_ONCE +HWY_EXPORT(vips_shrinkh_uchar_hwy); + /* Entry point declared in presample.h: passes its arguments unchanged to the implementation selected by HWY_DYNAMIC_DISPATCH. */ +void +vips_shrinkh_uchar_hwy(VipsPel *pout, VipsPel *pin, + int width, int hshrink, int bands) +{ + /* clang-format off */ + HWY_DYNAMIC_DISPATCH(vips_shrinkh_uchar_hwy)(pout, pin, + width, hshrink, bands); + /* clang-format on */ +} +#endif /*HWY_ONCE*/ + +#endif /*HAVE_HWY*/