diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp index 28bbb8da2334..dcc78ea1da1c 100644 --- a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp @@ -32,6 +32,32 @@ multiplyAdd(const T * a, const T * b, size_t sz) noexcept return sum; } +template +double +squaredEuclideanDistanceT(const T * a, const T * b, size_t sz) noexcept +{ + T partial[UNROLL]; + for (size_t i(0); i < UNROLL; i++) { + partial[i] = 0; + } + size_t i(0); + for (; i + UNROLL <= sz; i += UNROLL) { + for (size_t j(0); j < UNROLL; j++) { + T d = a[i+j] - b[i+j]; + partial[j] += d * d; + } + } + for (;i < sz; i++) { + T d = a[i] - b[i]; + partial[i%UNROLL] += d * d; + } + double sum(0); + for (size_t j(0); j < UNROLL; j++) { + sum += partial[j]; + } + return sum; +} + template void bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) noexcept { @@ -143,12 +169,12 @@ GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, double GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept { - return helper::squared_euclidean_distance_unrolled(a, b, sz); + return squaredEuclideanDistanceT(a, b, sz); } double GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept { - return helper::squared_euclidean_distance_unrolled(a, b, sz); + return squaredEuclideanDistanceT(a, b, sz); } void diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp index e1c0ec923eca..f7a6473df476 100644 --- a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp @@ -84,49 +84,30 @@ orChunks(size_t offset, const std::vector> &src, v } } -template -inline double squared_euclidean_distance_unrolled(const T* a, const T* b, size_t sz) noexcept { - // Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float - PartialT partial[UNROLL]; - for (size_t i = 0; i < UNROLL; ++i) { - partial[i] = 0; - } - size_t i = 0; - for (; i + UNROLL <= sz; i += UNROLL) { - for (size_t j = 0; j < UNROLL; ++j) { - ConvertElementT d = ConvertElementT(a[i+j]) - ConvertElementT(b[i+j]); - partial[j] += d * d; - } - } - for (; i < sz; ++i) { - ConvertElementT d = ConvertElementT(a[i]) - ConvertElementT(b[i]); - partial[i % UNROLL] += d * d; - } - double sum = 0; - for (size_t j = 0; j < UNROLL; ++j) { - sum += partial[j]; +template +double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline)); + +template +double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) { + //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float + TemporaryT sum = 0; + for (size_t i(0); i < sz; i++) { + int16_t d = int16_t(a[i]) - int16_t(b[i]); + sum += d * d; } return sum; } -template -double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept __attribute__((noinline)); - -template -double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept { - return squared_euclidean_distance_unrolled(a, b, sz); -} - inline double -squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) noexcept { +squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) { constexpr size_t LOOP_COUNT = 0x100; double sum(0); size_t i = 0; for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) { - sum += squared_euclidean_distance_unrolled_noinline(a + i, b + i, LOOP_COUNT); + sum += squaredEuclideanDistanceT(a + i, b + i, LOOP_COUNT); } if (sz > i) [[unlikely]] { - sum += squared_euclidean_distance_unrolled_noinline(a + i, b + i, sz - i); + sum += squaredEuclideanDistanceT(a + i, b + i, sz - i); } return sum; }