From 10370b9c1cb333e1bee2e520c044d5008478d83e Mon Sep 17 00:00:00 2001
From: Tor Brede Vekterli <vekterli@vespa.ai>
Date: Tue, 20 Aug 2024 09:42:20 +0000
Subject: [PATCH] Unify unrolled euclidean distance computation functions and
 unroll for `int8_t`

Experiments on Mac M1 show the following unrolling results for `int8_t`:

 1:  84 ms (baseline)
 2:  41 ms
 4:  53 ms
 8:  82 ms
 16: 120 ms

I.e. an unrolling factor of 2 is chosen for `int8_t`. It is expected that
this will also produce gains on x64 AVX{2,512}, but the concrete unrolling
factor has not been fine-tuned for this architecture yet.
---
 .../vespa/vespalib/hwaccelerated/generic.cpp  | 30 +------------
 .../hwaccelerated/private_helpers.hpp         | 45 +++++++++++++------
 2 files changed, 34 insertions(+), 41 deletions(-)
diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp
index dcc78ea1da1c..28bbb8da2334 100644
--- a/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp
@@ -32,32 +32,6 @@ multiplyAdd(const T * a, const T * b, size_t sz) noexcept
     return sum;
 }
 
-template <typename T, size_t UNROLL>
-double
-squaredEuclideanDistanceT(const T * a, const T * b, size_t sz) noexcept
-{
-    T partial[UNROLL];
-    for (size_t i(0); i < UNROLL; i++) {
-        partial[i] = 0;
-    }
-    size_t i(0);
-    for (; i + UNROLL <= sz; i += UNROLL) {
-        for (size_t j(0); j < UNROLL; j++) {
-            T d = a[i+j] - b[i+j];
-            partial[j] += d * d;
-        }
-    }
-    for (;i < sz; i++) {
-        T d = a[i] - b[i];
-        partial[i%UNROLL] += d * d;
-    }
-    double sum(0);
-    for (size_t j(0); j < UNROLL; j++) {
-        sum += partial[j];
-    }
-    return sum;
-}
-
 template<size_t UNROLL, typename Operation>
 void
 bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) noexcept {
@@ -169,12 +143,12 @@ GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b,
 
 double
 GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept {
-    return squaredEuclideanDistanceT<float, 16>(a, b, sz);
+    return helper::squared_euclidean_distance_unrolled<float, 16>(a, b, sz);
 }
 
 double
 GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept {
-    return squaredEuclideanDistanceT<double, 16>(a, b, sz);
+    return helper::squared_euclidean_distance_unrolled<double, 16>(a, b, sz);
 }
 
 void
diff --git a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp
index f7a6473df476..e1c0ec923eca 100644
--- a/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp
@@ -84,30 +84,49 @@ orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> &src, v
     }
 }
 
-template<typename TemporaryT=int32_t>
-double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline));
-
-template<typename TemporaryT>
-double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) {
-    //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
-    TemporaryT sum = 0;
-    for (size_t i(0); i < sz; i++) {
-        int16_t d = int16_t(a[i]) - int16_t(b[i]);
-        sum += d * d;
+template <typename T, size_t UNROLL, typename PartialT = T, typename ConvertElementT = T>
+inline double squared_euclidean_distance_unrolled(const T* a, const T* b, size_t sz) noexcept {
+    // Note: for int8_t input, accumulating in int32_t is 3 times faster than int64_t and 16x faster than float
+    PartialT partial[UNROLL];
+    for (size_t i = 0; i < UNROLL; ++i) {
+        partial[i] = 0;
+    }
+    size_t i = 0;
+    for (; i + UNROLL <= sz; i += UNROLL) {
+        for (size_t j = 0; j < UNROLL; ++j) {
+            ConvertElementT d = ConvertElementT(a[i+j]) - ConvertElementT(b[i+j]);
+            partial[j] += d * d;
+        }
+    }
+    for (; i < sz; ++i) {
+        ConvertElementT d = ConvertElementT(a[i]) - ConvertElementT(b[i]);
+        partial[i % UNROLL] += d * d;
+    }
+    double sum = 0;
+    for (size_t j = 0; j < UNROLL; ++j) {
+        sum += partial[j];
     }
     return sum;
 }
 
+template <typename T, size_t UNROLL, typename PartialT = T, typename ConvertElementT = T>
+double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept __attribute__((noinline));
+
+template <typename T, size_t UNROLL, typename PartialT, typename ConvertElementT>
+double squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept {
+    return squared_euclidean_distance_unrolled<T, UNROLL, PartialT, ConvertElementT>(a, b, sz);
+}
+
 inline double
-squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) {
+squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) noexcept {
     constexpr size_t LOOP_COUNT = 0x100;
     double sum(0);
     size_t i = 0;
     for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
-        sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, LOOP_COUNT);
+        sum += squared_euclidean_distance_unrolled_noinline<int8_t, 2, int32_t, int16_t>(a + i, b + i, LOOP_COUNT);
     }
     if (sz > i) [[unlikely]] {
-        sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, sz - i);
+        sum += squared_euclidean_distance_unrolled_noinline<int8_t, 2, int32_t, int16_t>(a + i, b + i, sz - i);
     }
     return sum;
 }