Enable SVE Support for L2 Metric Computation in FP32

kind/feature Signed-off-by: Adarsh Srivastava <[email protected]> Signed-off-by: Adarsh Srivastava <[email protected]>
zilliztech · Nov 30, 2024 · 97b3ee0 · 97b3ee0
1 parent 1cb9937
commit 97b3ee0
Show file tree

Hide file tree

Showing 6 changed files with 334 additions and 31 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -224,7 +224,43 @@ if(WITH_FAISS_TESTS)
   add_subdirectory(tests/faiss)
 endif()
 
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+
+    # arm_sve.h and arm_neon.h are intrinsic headers that ship inside the
+    # compiler's own include directory, which the compiler searches without
+    # any -I/-isystem flag. They are therefore probed with a test compile
+    # instead of find_path(): find_path() takes its PATHS entries literally
+    # (a "*" component is not expanded), and publishing a compiler-internal
+    # directory through include_directories() would affect every target.
+    include(CheckIncludeFileCXX)
+
+    # Probe with the flag used to build src/simd/distances_sve.cc so the
+    # result reflects whether that translation unit can include the header.
+    check_include_file_cxx(arm_sve.h KNOWHERE_HAVE_ARM_SVE_H
+                           "-march=armv8-a+sve")
+    if(KNOWHERE_HAVE_ARM_SVE_H)
+        message(STATUS "ARM SVE headers found")
+    else()
+        message(WARNING "ARM SVE headers not found!")
+    endif()
+
+    # Same probe for NEON, using the flag applied to
+    # src/simd/distances_neon.cc.
+    check_include_file_cxx(arm_neon.h KNOWHERE_HAVE_ARM_NEON_H
+                           "-march=armv8-a+simd")
+    if(KNOWHERE_HAVE_ARM_NEON_H)
+        message(STATUS "ARM NEON headers found")
+    else()
+        message(WARNING "ARM NEON headers not found!")
+    endif()
+
+    # A missing header is only reported; configuration continues, as it
+    # did before.
+endif()
+
 install(TARGETS knowhere
         DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR})
 install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/knowhere"
         DESTINATION "${CMAKE_INSTALL_PREFIX}/include")
+
+
diff --git a/cmake/libs/libfaiss.cmake b/cmake/libs/libfaiss.cmake
@@ -47,9 +47,18 @@ if(__X86_64)
 endif()
 
 if(__AARCH64)
-  set(UTILS_SRC src/simd/hook.cc src/simd/distances_ref.cc
-                src/simd/distances_neon.cc)
-  add_library(knowhere_utils STATIC ${UTILS_SRC})
+  set(UTILS_SRC src/simd/hook.cc src/simd/distances_ref.cc)
+  # NEON and SVE kernels are built as separate OBJECT libraries so that each
+  # source file gets its own -march flag (PRIVATE: not propagated to users).
+  add_library(utils_neon OBJECT src/simd/distances_neon.cc)
+  add_library(utils_sve OBJECT src/simd/distances_sve.cc)
+  # NOTE(review): +sve code presumably needs a runtime CPU check before use — confirm in hook.cc.
+  target_compile_options(utils_neon PRIVATE -march=armv8-a+simd)
+  target_compile_options(utils_sve PRIVATE -march=armv8-a+sve)
+  # Archive the common sources together with both sets of kernel objects.
+  add_library(
+    knowhere_utils STATIC
+    ${UTILS_SRC} $<TARGET_OBJECTS:utils_neon> $<TARGET_OBJECTS:utils_sve>)
   target_link_libraries(knowhere_utils PUBLIC glog::glog)
 endif()
 
@@ -75,6 +84,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aar
 else()
     find_package(BLAS REQUIRED)
 endif()
+
 if(__X86_64)
   list(REMOVE_ITEM FAISS_SRCS ${FAISS_AVX2_SRCS})
 

diff --git a/src/simd/distances_sve.cc b/src/simd/distances_sve.cc
@@ -0,0 +1,177 @@
+// Copyright (C) 2019-2023 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+
+#include "distances_sve.h"
+#include <arm_sve.h>
+#include <cmath>
+#include "faiss/impl/platform_macros.h"
+#pragma GCC optimize("O3,fast-math,inline")
+#if defined(__ARM_FEATURE_SVE)
+namespace faiss {
+// Returns the sum over i in [0, d) of (x[i] - y[i])^2, accumulated per SVE lane and reduced once at the end.
+float fvec_L2sqr_sve(const float* x, const float* y, size_t d) {
+    svfloat32_t sum = svdup_f32(0.0f);
+    size_t i = 0;
+    // All lanes start active; the predicate is narrowed only for the final, partial vector.
+    svbool_t pg = svptrue_b32();
+    // Each iteration consumes svcntw() floats (the number of 32-bit lanes in one SVE vector).
+    while (i < d) {
+        if (d - i < svcntw())
+            pg = svwhilelt_b32(i, d);
+        // The merging (_m) forms leave lanes outside pg unchanged, so the tail does not disturb `sum`.
+        svfloat32_t a = svld1_f32(pg, x + i);
+        svfloat32_t b = svld1_f32(pg, y + i);
+        svfloat32_t diff = svsub_f32_m(pg, a, b);
+        sum = svmla_f32_m(pg, sum, diff, diff);
+        i += svcntw();
+    }
+    // Horizontal add across every lane; lanes that were never active still hold their initial 0.
+    return svaddv_f32(svptrue_b32(), sum);
+}
+// Returns the sum over i in [0, d) of |x[i] - y[i]|.
+float fvec_L1_sve(const float* x, const float* y, size_t d) {
+    svfloat32_t sum = svdup_f32(0.0f);
+    size_t i = 0;
+    // All lanes start active; the predicate is narrowed only for the final, partial vector.
+    svbool_t pg = svptrue_b32();
+    // Each iteration consumes svcntw() floats (the number of 32-bit lanes in one SVE vector).
+    while (i < d) {
+        if (d - i < svcntw())
+            pg = svwhilelt_b32(i, d);
+        // svabs uses the don't-care (_x) form; lanes outside pg are then skipped by the merging add below.
+        svfloat32_t a = svld1_f32(pg, x + i);
+        svfloat32_t b = svld1_f32(pg, y + i);
+        svfloat32_t diff = svabs_f32_x(pg, svsub_f32_m(pg, a, b));
+        sum = svadd_f32_m(pg, sum, diff);
+        i += svcntw();
+    }
+    // Horizontal add across every lane; lanes that were never active still hold their initial 0.
+    return svaddv_f32(svptrue_b32(), sum);
+}
+// Returns the maximum over i in [0, d) of |x[i] - y[i]|; the running maximum starts at 0.
+float fvec_Linf_sve(const float* x, const float* y, size_t d) {
+    svfloat32_t max_val = svdup_f32(0.0f);
+    size_t i = 0;
+    // All lanes start active; the predicate is narrowed only for the final, partial vector.
+    svbool_t pg = svptrue_b32();
+    // Each iteration consumes svcntw() floats (the number of 32-bit lanes in one SVE vector).
+    while (i < d) {
+        if (d - i < svcntw())
+            pg = svwhilelt_b32(i, d);
+        // The merging max leaves lanes outside pg at their previous running maximum.
+        svfloat32_t a = svld1_f32(pg, x + i);
+        svfloat32_t b = svld1_f32(pg, y + i);
+        svfloat32_t diff = svabs_f32_x(pg, svsub_f32_m(pg, a, b));
+        max_val = svmax_f32_m(pg, max_val, diff);
+        i += svcntw();
+    }
+    // Reduce to the largest lane value; lanes that were never active still hold the initial 0.
+    return svmaxv_f32(svptrue_b32(), max_val);
+}
+// Returns the sum over i in [0, d) of x[i]^2.
+float fvec_norm_L2sqr_sve(const float* x, size_t d) {
+    svfloat32_t sum = svdup_f32(0.0f);
+    size_t i = 0;
+    // All lanes start active; the predicate is narrowed only for the final, partial vector.
+    svbool_t pg = svptrue_b32();
+    // Each iteration consumes svcntw() floats (the number of 32-bit lanes in one SVE vector).
+    while (i < d) {
+        if (d - i < svcntw())
+            pg = svwhilelt_b32(i, d);
+        // Merging multiply-add: lanes outside pg keep the value already accumulated in `sum`.
+        svfloat32_t a = svld1_f32(pg, x + i);
+        sum = svmla_f32_m(pg, sum, a, a);
+        i += svcntw();
+    }
+    // Horizontal add across every lane; lanes that were never active still hold their initial 0.
+    return svaddv_f32(svptrue_b32(), sum);
+}
+// Writes c[i] = a[i] + bf * b[i] for every i in [0, n).
+void fvec_madd_sve(size_t n, const float* a, float bf, const float* b, float* c) {
+    size_t i = 0;
+    svfloat32_t bf_vec = svdup_f32(bf);
+    // All lanes start active; the predicate is narrowed only for the final, partial vector.
+    svbool_t pg = svptrue_b32();
+    // Each iteration handles svcntw() floats (the number of 32-bit lanes in one SVE vector).
+    while (i < n) {
+        if (n - i < svcntw())
+            pg = svwhilelt_b32(i, n);
+        // Loads and the store share pg, so no element at index n or beyond is read or written.
+        svfloat32_t a_vec = svld1_f32(pg, a + i);
+        svfloat32_t b_vec = svld1_f32(pg, b + i);
+        svfloat32_t c_vec = svmla_f32_m(pg, a_vec, b_vec, bf_vec);
+        svst1_f32(pg, c + i, c_vec);
+        i += svcntw();
+    }
+}
+// Writes c[i] = a[i] + bf * b[i] for i in [0, n) and returns the index of the smallest c[i] (lowest index on ties).
+int fvec_madd_and_argmin_sve(size_t n, const float* a, float bf, const float* b, float* c) {
+    size_t i = 0;
+    svfloat32_t min_val = svdup_f32(INFINITY);
+    svuint32_t min_idx = svdup_u32(0);
+    svuint32_t idx_base = svindex_u32(0, 1);
+    // min_val / min_idx hold, per lane, the smallest value seen so far in that lane and the index it came from.
+    svfloat32_t bf_vec = svdup_f32(bf);
+    svbool_t pg = svptrue_b32();
+    // Each iteration handles svcntw() elements; pg is narrowed only for the final, partial vector.
+    while (i < n) {
+        if (n - i < svcntw())
+            pg = svwhilelt_b32(i, n);
+        // Global element index of every lane: i + {0, 1, 2, ...}.
+        svuint32_t idx = svadd_u32_z(pg, idx_base, svdup_u32(i));
+        svfloat32_t a_vec = svld1_f32(pg, a + i);
+        svfloat32_t b_vec = svld1_f32(pg, b + i);
+        svfloat32_t c_vec = svmla_f32_m(pg, a_vec, b_vec, bf_vec);
+        svst1_f32(pg, c + i, c_vec);
+        // Strict less-than: on equal values a lane keeps the index it recorded first.
+        svbool_t cmp = svcmplt(pg, c_vec, min_val);
+        min_val = svsel_f32(cmp, c_vec, min_val);
+        min_idx = svsel_u32(cmp, idx, min_idx);
+
+        i += svcntw();
+    }
+    // Several lanes may tie for the global minimum: take the smallest index among them (the first occurrence).
+    float min_value = svminv_f32(svptrue_b32(), min_val);
+    svbool_t pg_min = svcmpeq(svptrue_b32(), min_val, svdup_f32(min_value));
+    uint32_t min_index = svminv_u32(pg_min, min_idx);  // unlike svlastb, independent of the vector length
+
+    return static_cast<int>(min_index);
+}
+// Computes the squared L2 distance from x to each of y0..y3 (all of length d) and stores them in dis0..dis3.
+void fvec_L2sqr_batch_4_sve(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
+                            const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) {
+    float d0 = 0;
+    float d1 = 0;
+    float d2 = 0;
+    float d3 = 0;
+    // Plain scalar loop with four independent accumulators; no SVE intrinsics are used in this function.
+    for (size_t i = 0; i < d; ++i) {
+        const float q0 = x[i] - y0[i];
+        const float q1 = x[i] - y1[i];
+        const float q2 = x[i] - y2[i];
+        const float q3 = x[i] - y3[i];
+        d0 += q0 * q0;
+        d1 += q1 * q1;
+        d2 += q2 * q2;
+        d3 += q3 * q3;
+    }
+    // The output references are written only here, once the loop has finished.
+    dis0 = d0;
+    dis1 = d1;
+    dis2 = d2;
+    dis3 = d3;
+}
+
+
+} // namespace faiss
+
+#endif
diff --git a/src/simd/distances_sve.h b/src/simd/distances_sve.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2019-2023 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+namespace faiss {
+// Squared L2 distance between x and y, both of length d.
+float fvec_L2sqr_sve(const float* x, const float* y, size_t d);
+// L1 distance (sum of absolute differences) between x and y, both of length d.
+float fvec_L1_sve(const float* x, const float* y, size_t d);
+// L-infinity distance (largest absolute difference) between x and y, both of length d.
+float fvec_Linf_sve(const float* x, const float* y, size_t d);
+// Squared L2 norm of x, of length d.
+float fvec_norm_L2sqr_sve(const float* x, size_t d);
+// Writes c[i] = a[i] + bf * b[i] for every i in [0, n).
+void fvec_madd_sve(size_t n, const float* a, float bf, const float* b, float* c);
+// Same update as fvec_madd_sve; additionally returns the index of the smallest c[i].
+int fvec_madd_and_argmin_sve(size_t n, const float* a, float bf, const float* b, float* c);
+// NOTE(review): no definition of ivec_L2sqr_sve is visible in distances_sve.cc — confirm where it is defined.
+int32_t ivec_L2sqr_sve(const int8_t* x, const int8_t* y, size_t d);
+// Squared L2 distances from x to each of y0..y3 (all of length d), stored in dis0..dis3.
+void fvec_L2sqr_batch_4_sve(const float* x, const float* y0, const float* y1, const float* y2, const float* y3,
+                            const size_t d, float& dis0, float& dis1, float& dis2, float& dis3);
+
+} // namespace faiss
+#endif