From 10370b9c1cb333e1bee2e520c044d5008478d83e Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Tue, 20 Aug 2024 09:42:20 +0000 Subject: [PATCH] Unify unrolled euclidean distance computation functions and unroll for `int8_t` Experiments on Mac M1 show the following unrolling results for `int8_t`: 1: 84 ms (baseline) 2: 41 ms 4: 53 ms 8: 82 ms 16: 120 ms I.e. an unrolling factor of 2 is chosen for `int8_t`. It is expected that this will also produce gains on x64 AVX{2,512}, but the concrete unrolling factor has not been fine-tuned for this architecture yet. --- .../vespa/vespalib/hwaccelerated/generic.cpp | 30 +------------ .../hwaccelerated/private_helpers.hpp | 45 +++++++++++++------ 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp index dcc78ea1da1c..28bbb8da2334 100644 --- a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp @@ -32,32 +32,6 @@ multiplyAdd(const T * a, const T * b, size_t sz) noexcept return sum; } -template -double -squaredEuclideanDistanceT(const T * a, const T * b, size_t sz) noexcept -{ - T partial[UNROLL]; - for (size_t i(0); i < UNROLL; i++) { - partial[i] = 0; - } - size_t i(0); - for (; i + UNROLL <= sz; i += UNROLL) { - for (size_t j(0); j < UNROLL; j++) { - T d = a[i+j] - b[i+j]; - partial[j] += d * d; - } - } - for (;i < sz; i++) { - T d = a[i] - b[i]; - partial[i%UNROLL] += d * d; - } - double sum(0); - for (size_t j(0); j < UNROLL; j++) { - sum += partial[j]; - } - return sum; -} - template void bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) noexcept { @@ -169,12 +143,12 @@ GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, double GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept { - return squaredEuclideanDistanceT(a, b, sz); + return helper::squared_euclidean_distance_unrolled(a, b, sz); } double GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept { - return squaredEuclideanDistanceT(a, b, sz); + return helper::squared_euclidean_distance_unrolled(a, b, sz); } void diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp index f7a6473df476..e1c0ec923eca 100644 --- a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp @@ -84,30 +84,49 @@ orChunks(size_t offset, const std::vector> &src, v } } -template -double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline)); - -template -double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) { - //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float - TemporaryT sum = 0; - for (size_t i(0); i < sz; i++) { - int16_t d = int16_t(a[i]) - int16_t(b[i]); - sum += d * d; +template +inline double squared_euclidean_distance_unrolled(const T* a, const T* b, size_t sz) noexcept { + // Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float + PartialT partial[UNROLL]; + for (size_t i = 0; i < UNROLL; ++i) { + partial[i] = 0; + } + size_t i = 0; + for (; i + UNROLL <= sz; i += UNROLL) { + for (size_t j = 0; j < UNROLL; ++j) { + ConvertElementT d = ConvertElementT(a[i+j]) - ConvertElementT(b[i+j]); + partial[j] += d * d; + } + } + for (; i < sz; ++i) { + ConvertElementT d = ConvertElementT(a[i]) - ConvertElementT(b[i]); + partial[i % UNROLL] += d * d; + } + double sum = 0; + for (size_t j = 0; j < UNROLL; ++j) { + sum += partial[j]; } return sum; } +template +double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept __attribute__((noinline)); + +template +double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept { + return squared_euclidean_distance_unrolled(a, b, sz); +} + inline double -squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) { +squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) noexcept { constexpr size_t LOOP_COUNT = 0x100; double sum(0); size_t i = 0; for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) { - sum += squaredEuclideanDistanceT(a + i, b + i, LOOP_COUNT); + sum += squared_euclidean_distance_unrolled_noinline(a + i, b + i, LOOP_COUNT); } if (sz > i) [[unlikely]] { - sum += squaredEuclideanDistanceT(a + i, b + i, sz - i); + sum += squared_euclidean_distance_unrolled_noinline(a + i, b + i, sz - i); } return sum; }