Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify unrolled euclidean distance computation functions and unroll for int8_t #32194

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 2 additions & 28 deletions vespalib/src/vespa/vespalib/hwaccelerated/generic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,32 +32,6 @@ multiplyAdd(const T * a, const T * b, size_t sz) noexcept
return sum;
}

/**
 * Computes the squared euclidean distance between two arrays.
 *
 * The work is spread over UNROLL independent running sums of type T.
 * Element k always contributes to running sum (k % UNROLL); the running
 * sums are added together, in index order, into a double at the very end.
 *
 * @tparam T      element type; also the type of each running sum
 * @tparam UNROLL number of independent running sums
 * @param a  first array, at least sz elements
 * @param b  second array, at least sz elements
 * @param sz number of elements to process
 * @return sum over k of (a[k] - b[k])^2, as a double
 */
template <typename T, size_t UNROLL>
double
squaredEuclideanDistanceT(const T * a, const T * b, size_t sz) noexcept
{
    // One zero-initialised accumulator per lane.
    T lanes[UNROLL] = {};

    // Count of leading elements that form complete groups of UNROLL.
    const size_t grouped = sz - (sz % UNROLL);
    for (size_t base = 0; base < grouped; base += UNROLL) {
        for (size_t lane = 0; lane < UNROLL; ++lane) {
            const T diff = a[base + lane] - b[base + lane];
            lanes[lane] += diff * diff;
        }
    }

    // Fewer than UNROLL elements remain; each one still goes to its own lane.
    for (size_t pos = grouped; pos < sz; ++pos) {
        const T diff = a[pos] - b[pos];
        lanes[pos % UNROLL] += diff * diff;
    }

    // Fold the lanes, first to last, into the double result.
    double total = 0;
    for (const T & lane_sum : lanes) {
        total += lane_sum;
    }
    return total;
}

template<size_t UNROLL, typename Operation>
void
bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) noexcept {
Expand Down Expand Up @@ -169,12 +143,12 @@ GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b,

// Squared euclidean distance over float arrays of sz elements, delegated to a
// 16-way unrolled helper instantiated for float.
// NOTE(review): two return statements follow each other here. As written only
// the first one (squaredEuclideanDistanceT<float, 16>) can execute; the second
// (helper::squared_euclidean_distance_unrolled<float, 16>) is unreachable.
// This looks like the removed/added line pair of a diff rendered without +/-
// markers — confirm which single line is meant to remain.
double
GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept {
return squaredEuclideanDistanceT<float, 16>(a, b, sz);
return helper::squared_euclidean_distance_unrolled<float, 16>(a, b, sz);
}

// Squared euclidean distance over double arrays of sz elements, delegated to a
// 16-way unrolled helper instantiated for double.
// NOTE(review): two return statements follow each other here. As written only
// the first one (squaredEuclideanDistanceT<double, 16>) can execute; the second
// (helper::squared_euclidean_distance_unrolled<double, 16>) is unreachable.
// This looks like the removed/added line pair of a diff rendered without +/-
// markers — confirm which single line is meant to remain.
double
GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept {
return squaredEuclideanDistanceT<double, 16>(a, b, sz);
return helper::squared_euclidean_distance_unrolled<double, 16>(a, b, sz);
}

void
Expand Down
45 changes: 32 additions & 13 deletions vespalib/src/vespa/vespalib/hwaccelerated/private_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,30 +84,49 @@ orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> &src, v
}
}

// Forward declaration of the int8_t squared-distance kernel. It supplies the
// default accumulator type (TemporaryT = int32_t) and attaches the noinline
// attribute to the function.
template<typename TemporaryT=int32_t>
double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline));

template<typename TemporaryT>
double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) {
//Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
TemporaryT sum = 0;
for (size_t i(0); i < sz; i++) {
int16_t d = int16_t(a[i]) - int16_t(b[i]);
sum += d * d;
/**
 * Squared euclidean distance between two arrays, computed with UNROLL
 * independent running sums.
 *
 * Every element is first converted to ConvertElementT, the difference is
 * stored as ConvertElementT, and its square is added to a running sum of type
 * PartialT. Element k always lands in running sum (k % UNROLL). The running
 * sums are added, in index order, into the double that is returned.
 *
 * Nothing here guards against overflow of PartialT; the caller has to pick
 * PartialT (and limit sz) so that the running sums fit.
 *
 * Performance note carried over from the original comment: accumulating in
 * int32_t is 3 times faster than int64_t and 16x faster than float.
 *
 * @tparam T               element type of the input arrays
 * @tparam UNROLL          number of independent running sums
 * @tparam PartialT        type of each running sum (defaults to T)
 * @tparam ConvertElementT type elements are converted to before subtracting
 *                         (defaults to T)
 * @param a  first array, at least sz elements
 * @param b  second array, at least sz elements
 * @param sz number of elements to process
 * @return sum over k of (a[k] - b[k])^2, as a double
 */
template <typename T, size_t UNROLL, typename PartialT = T, typename ConvertElementT = T>
inline double squared_euclidean_distance_unrolled(const T* a, const T* b, size_t sz) noexcept {
    // One zero-initialised accumulator per lane.
    PartialT lanes[UNROLL] = {};

    // Count of leading elements that form complete groups of UNROLL.
    const size_t grouped = sz - (sz % UNROLL);
    for (size_t base = 0; base < grouped; base += UNROLL) {
        for (size_t lane = 0; lane < UNROLL; ++lane) {
            const size_t k = base + lane;
            const ConvertElementT diff = static_cast<ConvertElementT>(a[k]) - static_cast<ConvertElementT>(b[k]);
            lanes[lane] += diff * diff;
        }
    }

    // Fewer than UNROLL elements remain; each one still goes to its own lane.
    for (size_t k = grouped; k < sz; ++k) {
        const ConvertElementT diff = static_cast<ConvertElementT>(a[k]) - static_cast<ConvertElementT>(b[k]);
        lanes[k % UNROLL] += diff * diff;
    }

    // Fold the lanes, first to last, into the double result.
    double total = 0;
    for (const PartialT& lane_sum : lanes) {
        total += lane_sum;
    }
    return total;
}

/**
 * Variant of squared_euclidean_distance_unrolled that carries the noinline
 * attribute. It takes the same template parameters and arguments, forwards
 * them unchanged, and returns the forwarded call's result.
 */
template <typename T, size_t UNROLL, typename PartialT = T, typename ConvertElementT = T>
__attribute__((noinline)) double
squared_euclidean_distance_unrolled_noinline(const T* a, const T* b, size_t sz) noexcept
{
    const double distance = squared_euclidean_distance_unrolled<T, UNROLL, PartialT, ConvertElementT>(a, b, sz);
    return distance;
}

// Squared euclidean distance between two int8_t arrays of sz elements.
// The input is walked in chunks of LOOP_COUNT (0x100 = 256) elements; each
// chunk is handed to a kernel that accumulates in int32_t, and the per-chunk
// results are added up in a double. A final call covers the sz - i elements
// left over when sz is not a multiple of LOOP_COUNT.
// Presumably the chunking keeps the int32_t accumulation away from overflow:
// a difference of two int8_t values is at most 255 in magnitude, so one chunk
// contributes at most 256 * 255^2 = 16,646,400 — TODO confirm intent.
// NOTE(review): the signature line and both kernel calls each appear twice
// (squaredEuclideanDistanceT<int32_t> followed by
// squared_euclidean_distance_unrolled_noinline<int8_t, 2, int32_t, int16_t>).
// This looks like removed/added line pairs of a diff rendered without +/-
// markers — confirm which line of each pair is meant to remain.
inline double
squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) {
squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) noexcept {
// Elements handed to the kernel per call.
constexpr size_t LOOP_COUNT = 0x100;
double sum(0);
size_t i = 0;
// Full chunks of LOOP_COUNT elements.
for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, LOOP_COUNT);
sum += squared_euclidean_distance_unrolled_noinline<int8_t, 2, int32_t, int16_t>(a + i, b + i, LOOP_COUNT);
}
// Remaining tail of fewer than LOOP_COUNT elements, marked as the unlikely path.
if (sz > i) [[unlikely]] {
sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, sz - i);
sum += squared_euclidean_distance_unrolled_noinline<int8_t, 2, int32_t, int16_t>(a + i, b + i, sz - i);
}
return sum;
}
Expand Down