diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml new file mode 100644 index 000000000..55e2978cc --- /dev/null +++ b/.github/workflows/arm.yml @@ -0,0 +1,120 @@ +name: ARM +'on': + push: + branches: + - main + - ci-sandbox + pull_request: + branches: + - '**' +env: + ccache_basedir: ${{ github.workspace }} + ccache_dir: "${{ github.workspace }}/.ccache" + ccache_compilercheck: content + ccache_compress: 'true' + ccache_compresslevel: 9 + ccache_maxsize: 200M + ccache_cmake: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache + ndk: "${{ github.workspace }}/android-ndk-r23b" + abi: "arm64-v8a" + minsdk_version : 28 + android_platform: 28 + +jobs: + ubuntu: + name: "arm-v8a cross-compile via Android NDK" + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install prerequisites + run: | + wget -c --quiet https://dl.google.com/android/repository/android-ndk-r23b-linux.zip + unzip -qq android-ndk-r23b-linux.zip + sudo apt-get -y install ccache cmake + + - name: Generate ccache_vars for ccache based on machine + shell: bash + id: ccache_vars + run: |- + echo "::set-output name=hash::$(echo ${{ env.ccache_compilercheck }})" + echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')" + + - name: Cache-op for build-cache through ccache + uses: actions/cache@v2 + with: + path: ${{ env.ccache_dir }} + key: ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }} + restore-keys: |- + ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }} + ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }} + ccache-${{ matrix.identifier }} + + - name: ccache environment setup + run: |- + echo "CCACHE_COMPILER_CHECK=${{ env.ccache_compilercheck }}" >> $GITHUB_ENV + echo "CCACHE_BASEDIR=${{ env.ccache_basedir }}" >> $GITHUB_ENV + echo "CCACHE_COMPRESS=${{ env.ccache_compress }}" >> $GITHUB_ENV + echo "CCACHE_COMPRESSLEVEL=${{ env.ccache_compresslevel }}" >> $GITHUB_ENV + echo "CCACHE_DIR=${{ env.ccache_dir }}" >> $GITHUB_ENV + echo "CCACHE_MAXSIZE=${{ env.ccache_maxsize }}" >> $GITHUB_ENV + + - name: ccache prolog + run: |- + ccache -s # Print current cache stats + ccache -z # Zero cache entry + + - name: Generate buildfiles for marian on android via cmake + run: |- + mkdir -p build + cd build + NDK=${{ env.ndk }} + ABI=${{ env.abi }} + MINSDK_VERSION=${{ env.minsdk_version }} + ANDROID_PLATFORM=${{ env.android_platform }} + OTHER_ANDROID_ARGS=( + -DANDROID_ARM_NEON=TRUE + ) + OTHER_MARIAN_ARGS=( + -DCOMPILE_CUDA=off + -DCOMPILE_CPU=on + -DCMAKE_HAVE_THREADS_LIBRARY=1 + -DCMAKE_USE_WIN32_THREADS_INIT=0 + -DCMAKE_USE_PTHREADS_INIT=1 + -DTHREADS_PREFER_PTHREAD_FLAG=ON + -DBUILD_ARCH=armv8-a + # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. + ) + # Additionally list variables finally configured. + cmake -L \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_TOOLCHAIN=clang \ + -DANDROID_ABI=$ABI \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DANDROID_NATIVE_API_LEVEL=$MINSDKVERSION \ + -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \ + -DANDROID_STL=c++_static \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \ + .. 
+ + + - name : Build marian for android + working-directory: build + run: |- + # Only build marian (lib) for now. + make -j2 + + - name: ccache epilog + run: 'ccache -s # Print current cache stats' + + - uses: actions/upload-artifact@v2 + with: + path: ${{github.workspace}}/build/marian-decoder + + diff --git a/.gitmodules b/.gitmodules index a8facd1fd..536fa3829 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,3 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/google/ruy +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/JishinMaster/simd_utils/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 16986ce4b..efa096f55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,19 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) include(CMakeDependentOption) +# Architecture detection +include(TargetArch) + +target_architecture(CMAKE_TARGET_ARCHITECTURES) +list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len) +if(NOT "${cmake_target_arch_len}" STREQUAL "1") + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "universal") +else() + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}") +endif() + # Custom CMake options option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) @@ -31,24 +44,58 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) -option(M32_BINARIES "Generate 32bit binaries even when building outside of WASM. Useful for testing some WASM specific functionality without the need for the compiling to WASM." OFF) option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) +option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) +option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) +option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) + # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) -CMAKE_DEPENDENT_OPTION(USE_WASM_COMPATIBLE_BLAS "Compile with wasm compatible blas" ON +CMAKE_DEPENDENT_OPTION(USE_ONNX_SGEMM "Compile with wasm compatible blas" ON "USE_WASM_COMPATIBLE_SOURCE" OFF) CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON "USE_WASM_COMPATIBLE_SOURCE" OFF) +if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + set(USE_RUY ON) + + # Apple M1 has Apple Accelerate(?). 
+ if(NOT APPLE) + set(USE_RUY_SGEMM ON) + endif(NOT APPLE) + + set(USE_SIMD_UTILS ON) +else() + set(USE_INTGEMM ON) +endif() + +if(USE_INTGEMM) + add_compile_definitions(USE_INTGEMM=1) +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + add_compile_definitions(ARM FMA SSE) #added for ARM + endif() + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(USE_SIMD_UTILS) + + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) @@ -61,10 +108,11 @@ if (COMPILE_WASM) set(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160") endif() -if(M32_BINARIES OR COMPILE_WASM) + +if(COMPILE_WASM) set("BUILD_WIDTH" "-m32") -else(M32_BINARIES OR COMPILE_WASM) - set("BUILD_WIDTH" "-m64") +else(COMPILE_WASM) + set("BUILD_WIDTH" "") endif() if(NOT COMPILE_WASM) @@ -194,7 +242,6 @@ if(MSVC) add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1) endif(USE_FBGEMM) else(MSVC) - # Check we are using at least g++ 5.0 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "FATAL ERROR: Compiling Marian requires at least g++ 5.0, your version is ${CMAKE_CXX_COMPILER_VERSION}") @@ -249,12 +296,14 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - set(INTRINSICS "-msse4.1") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(INTRINSICS "-msse4.1") + endif () endif() if(USE_FBGEMM) set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_definitions(-DUSE_FBGEMM=1) + add_compile_definitions(USE_FBGEMM=1) endif(USE_FBGEMM) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) @@ -324,6 +373,7 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) + # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. # -C makes gcc keep comments. @@ -544,16 +594,22 @@ endif(USE_MPI) ############################################################################### # Find BLAS library for CPU compilation if(COMPILE_CPU) - set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant + if(USE_INTGEMM) + set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant + endif(USE_INTGEMM) + + if(USE_RUY OR USE_RUY_SGEMM) + set(EXT_LIBS ${EXT_LIBS} ruy) + endif(USE_RUY OR USE_RUY_SGEMM) + add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on # if there are BLAS vendors, we have other runtime checks with sane error messages. 
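The `-flax-vector-conversions` flag enabled above for the simd_utils path is, presumably, needed because that header maps SSE-style helpers onto NEON vector types. The snippet below is an illustrative sketch (not simd_utils' actual code) of the constraint the flag relaxes; all names in it are made up for the example.

```cpp
// Illustrative sketch only: by default GCC/Clang reject implicit conversions
// between distinct vector types, which SSE-on-NEON shim headers tend to rely
// on; -flax-vector-conversions relaxes that check.
#include <arm_neon.h>

inline float32x4_t scaled(float32x4_t v) { return vmulq_n_f32(v, 0.5f); }

inline float32x4_t fromBits(uint32x4_t bits) {
  float32x4_t f = vreinterpretq_f32_u32(bits);  // explicit reinterpret: always legal
  // float32x4_t g = bits;                      // implicit: accepted only with
  //                                            // -flax-vector-conversions
  return scaled(f);
}
```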
- if(USE_WASM_COMPATIBLE_BLAS) + if(USE_ONNX_SGEMM) ## Use a wasm compatible BLAS + ## ^ SGEMM != BLAS set(EXT_LIBS ${EXT_LIBS} onnx-sgemm) - set(BLAS_FOUND TRUE) - set(BLAS_VENDOR "ONNX-SGEMM") - add_definitions(-DBLAS_FOUND=1 -DWASM_COMPATIBLE_BLAS=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock + add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock elseif(APPLE AND USE_APPLE_ACCELERATE) set(BLAS_VENDOR "Accelerate") # see https://developer.apple.com/documentation/accelerate for more info @@ -561,7 +617,9 @@ if(COMPILE_CPU) include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers") set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate") add_definitions(-DBLAS_FOUND=1) - else(USE_WASM_COMPATIBLE_BLAS) + elseif(USE_RUY_SGEMM) + add_compile_definitions(USE_RUY_SGEMM=1) + else(USE_ONNX_SGEMM) if(USE_MKL) find_package(MKL) endif(USE_MKL) @@ -582,7 +640,8 @@ if(COMPILE_CPU) endif(CBLAS_FOUND) endif(BLAS_FOUND) endif(MKL_FOUND) - endif(USE_WASM_COMPATIBLE_BLAS) + endif(USE_ONNX_SGEMM) + endif(COMPILE_CPU) ############################################################################### diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..f653e3e28 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,142 @@ +# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." 
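For orientation, the same "which architecture is this" question is answered at the source level elsewhere in this patch with predefined compiler macros. The sketch below (illustrative, not part of the patch) shows that style of check; it mirrors what the configure-time `#error` trick in TargetArch.cmake below extracts from the compiler output.

```cpp
// Sketch: compile-time architecture check via predefined macros, the source-level
// counterpart of TargetArch.cmake's configure-time detection.
#include <cstdio>

static const char* targetArch() {
#if defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__)
  return "armv8 / NEON";
#elif defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)
  return "x86_64";
#elif defined(__i386__) || defined(_M_IX86)
  return "i386";
#else
  return "unknown";
#endif
}

int main() { std::puts(targetArch()); return 0; }
```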
+ +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__) + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\ + || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\ + || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\ + || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) + #error cmake_ARCH armv8 + #elif defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. 
+ + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index f335c218a..f2062c381 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -5,17 +5,31 @@ add_subdirectory(./yaml-cpp) if(NOT USE_WASM_COMPATIBLE_SOURCE) add_subdirectory(./SQLiteCpp) add_subdirectory(./zlib) + add_subdirectory(./faiss) include_directories(./faiss) endif() -add_subdirectory(./pathie-cpp) -set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") -add_subdirectory(./intgemm) +add_subdirectory(./pathie-cpp) -if(USE_WASM_COMPATIBLE_BLAS) +if(USE_INTGEMM) + set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") + add_subdirectory(./intgemm) +endif(USE_INTGEMM) + +if(USE_RUY) + set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE) + add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL) + add_subdirectory(ruy EXCLUDE_FROM_ALL) +endif(USE_RUY) + +if(USE_ONNX_SGEMM) add_subdirectory(./onnxjs) -endif(USE_WASM_COMPATIBLE_BLAS) +endif(USE_ONNX_SGEMM) if(USE_FBGEMM) # @TODO: find out if this is somehow harmful. 
This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp index 103b0910e..a26c2b4d3 100644 --- a/src/3rd_party/faiss/VectorTransform.cpp +++ b/src/3rd_party/faiss/VectorTransform.cpp @@ -132,7 +132,18 @@ const float *fvecs_maybe_subsample( return x_subset; } -#if 1 // def __SSE__ +float fvec_norm_L2sqr_ref(const float *x, size_t d) +{ + size_t i; + double res = 0; + for (i = 0; i < d; i++) + res += x[i] * x[i]; + return res; +} + + + +#ifdef __SSE__ // reads 0 <= d < 4 floats as __m128 static inline __m128 masked_read(int d, const float *x) { diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy new file mode 160000 index 000000000..2d950b3bf --- /dev/null +++ b/src/3rd_party/ruy @@ -0,0 +1 @@ +Subproject commit 2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0 diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 3ffdc0065..7f669c3f8 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d +Subproject commit 7f669c3f8f5fcc288838f3beba88a04533824f73 diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..696036258 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9123e2324..76aa0e2b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,6 +7,7 @@ include_directories(3rd_party/sentencepiece) include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) include_directories(3rd_party/fbgemm/include) include_directories(3rd_party/intgemm) +include_directories(3rd_party/ruy) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party) include_directories(${CMAKE_BINARY_DIR}/local/include) @@ -99,12 +100,15 @@ set(MARIAN_SOURCES $ ) -if (NOT USE_WASM_COMPATIBLE_SOURCE) +if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID) list(APPEND MARIAN_SOURCES 3rd_party/ExceptionWithCallStack.cpp + ) +endif() +if (NOT USE_WASM_COMPATIBLE_SOURCE) + list(APPEND MARIAN_SOURCES data/corpus_sqlite.cpp - tensors/cpu/fbgemm/packed_gemm.cpp layers/lsh.cpp optimizers/quantizer.cpp @@ -122,6 +126,11 @@ if (NOT USE_WASM_COMPATIBLE_SOURCE) $ $ ) + if(USE_FBGEMM) + list(APPEND MARIAN_SOURCES + tensors/cpu/fbgemm/packed_gemm.cpp + ) + endif(USE_FBGEMM) endif() add_library(marian STATIC ${MARIAN_SOURCES}) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9e90bbb59..3bcc6518e 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -15,16 +15,6 @@ #include #include -#if MKL_FOUND -#include -#elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else - #include - #endif // WASM_COMPATIBLE_BLAS -#endif - namespace marian { // TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 999b97b42..c21cf4207 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,7 +128,7 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults - struct sigaction sa = { {0} }; + struct sigaction sa = {}; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { 
ABORT("Segmentation fault"); }; @@ -149,8 +149,8 @@ void switchtoMultinodeLogging(std::string nodeIdStr) { namespace marian { std::string noinline getCallStack(size_t skipLevels) { - #ifdef WASM_COMPATIBLE_SOURCE - return "Callstacks not supported in WASM builds currently"; + #if defined(WASM_COMPATIBLE_SOURCE) || defined(__ANDROID__) + return "Callstacks not supported in WASM or Android builds currently"; #else return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true); #endif diff --git a/src/common/types.h b/src/common/types.h index 575e77120..6052896fc 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -17,7 +17,17 @@ #include #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code + +#ifdef __AVX__ #include +#elif __SSE__ +#include +#endif + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif + #endif #ifdef __CUDACC__ // nvcc is compiling this code @@ -161,6 +171,43 @@ struct intgemm8 { #ifndef __CUDACC__ // vectorized types not available from .cu files + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +// The following struct fills this structure for ARM with NEON SIMD, changing +// __m128 and _mm_set1_ps with the equivalents on ARM-NEON. +struct float32x4 { +private: + // NEON uses 128-bit SIMD registers, same as SSE. We are copying this class + // and locally aliasing __m128 to float32x4_t, which is the NEON + // equivalent. + using __m128 = float32x4_t; + __m128 f_; + +public: + float32x4() {} + float32x4(const __m128& f) : f_(f) {} + // __m128 _mm_set1_ps(float) copies value into all slots, vdupq_n_f32 is it's + // NEON equivalent. + float32x4(const float& f) : f_(vdupq_n_f32(f)) {} + + operator const __m128&() const { return f_; } + operator __m128&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, float32x4 f4) { + float* a = (float*)&f4; + out << "[" << a[0]; + for(int i = 1; i < 4; i++) + out << " " << a[i]; + out << "]"; + return out; + } +}; + +#else // @TODO: check what intrinsics are actually available. 
struct float32x4 { private: @@ -188,6 +235,8 @@ struct float32x4 { } }; +#endif + // @TODO: consider how code can be shared via templating #ifdef __AVX__ struct float32x8 { diff --git a/src/functional/operators.h b/src/functional/operators.h index d79ac3c05..1a67e22a1 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -213,7 +213,15 @@ struct Ops { // __CUDA_ARCH__ is defined when compiling device (GPU) code #ifndef __CUDACC__ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "3rd_party/simd_utils/simd_utils.h" +#pragma GCC diagnostic pop +#else #include "3rd_party/sse_mathfun.h" +#endif + namespace marian { namespace functional { diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index a91778ed5..19b040ab9 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -3,7 +3,7 @@ #include "tensors/cpu/prod_blas.h" #if BLAS_FOUND -#include "3rd_party/faiss/IndexLSH.h" +#include "faiss/IndexLSH.h" #endif namespace marian { @@ -127,4 +127,4 @@ Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) { } #endif -} // namespace marian \ No newline at end of file +} // namespace marian diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 583a2f792..a571cbc80 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -78,14 +78,27 @@ class Backend : public marian::Backend { void setInt8(bool optimize) override { int8_ = optimize; } bool isInt8() override { return int8_; } - void setShifted(bool shifted) override { shifted_ = shifted; } + void setShifted(bool shifted) override { +#if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); + shifted_ = false; +#else + shifted_ = shifted; +#endif + } bool isShifted() override { return shifted_; } void setShiftedAll(bool shiftedAll) override { +#if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); + shiftedAll_ = false; + shifted_ = false; +#else shiftedAll_ = shiftedAll; if (shiftedAll_) { shifted_ = true; } +#endif } bool isShiftedAll() override { diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index d93719d8e..4af120e4d 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -209,7 +209,7 @@ class ExpressionGraphPackable : public ExpressionGraph { if (gemmElementType == Type::intgemm8) { #if defined(WASM) ABORT("Int8::PrepareA is not implemented for wasm."); -#else +#elif defined(USE_INTGEMM) float quantMult = 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements()); intgemm::Int8::PrepareA(tmp->data(), /*input*/ paramMat->data(), /*output*/ @@ -218,11 +218,13 @@ class ExpressionGraphPackable : public ExpressionGraph { cols(val)); //Put the quantMult at the back of the tensor *(reinterpret_cast(paramMat->data() + val->shape().elements())) = quantMult; +#else + ABORT("Int8::PrepareA not implemented yet for ruy"); #endif } else { #if defined(WASM) ABORT("Int16::PrepareA is not implemented for wasm."); -#else +#elif defined(USE_INTGEMM) float quantMult = 1024.0f; intgemm::Int16::PrepareA(tmp->data(), /*input*/ paramMat->data(), /*output*/ @@ -231,6 +233,8 @@ class ExpressionGraphPackable : public ExpressionGraph { cols(val)); //Put the quantMult at the back of the tensor *(reinterpret_cast(paramMat->data() + 
val->shape().elements())) = quantMult; +#else + ABORT("Int16::PrepareA is not implemented for wasm."); #endif } diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp index 21e7254fa..edaeb112c 100644 --- a/src/tensors/cpu/integer_common.cpp +++ b/src/tensors/cpu/integer_common.cpp @@ -1,5 +1,14 @@ #include "integer_common.h" +#ifdef __SSE__ +#include +#include +#include +#include +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif + namespace marian { namespace cpu { namespace integer { @@ -14,7 +23,9 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { for(int j = 0; j < m; ++j) { int i = 0; + #ifdef __AVX512F__ + // Multiples of 16 add together. int n16 = n & ~15; for(; i < n16; i += 16) { __m512 ai = _mm512_loadu_ps(x + j * n + i); @@ -22,7 +33,8 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m512 yi = _mm512_add_ps(ai, bi); _mm512_storeu_ps(y + j * n + i, yi); } -#else +#elif __SSE__ + // Multiples of 4 add together. int n4 = (n / 4) * 4; for(; i < n4; i += 4) { __m128 ai = _mm_loadu_ps(x + j * n + i); @@ -30,6 +42,21 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m128 yi = _mm_add_ps(ai, bi); _mm_storeu_ps(y + j * n + i, yi); } +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) + int n4 = (n / 4) * 4; + using __m128 = float32x4_t; + for(; i < n4; i += 4) { + __m128 ai = vld1q_f32(x + j * n + i); + __m128 bi = vld1q_f32(bias + i); + __m128 yi = vaddq_f32(ai, bi); + vst1q_f32(y + j * n + i, yi); + } + +#else + // StandardCPP No SIMD case. + for(i = 0; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } #endif for(; i < n; i++) { y[j * n + i] = x[j * n + i] + bias[i]; @@ -39,4 +66,4 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { } //integer } //cpu -} //marian \ No newline at end of file +} //marian diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index 97ca79c12..d05440ef1 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -4,15 +4,20 @@ #include "tensors/tensor_operators.h" #include "tensors/cpu/aligned.h" #include "common/io_item.h" +#ifdef USE_INTGEMM #include "3rd_party/intgemm/intgemm/intgemm.h" +#else // USE_INTGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" +#include +#pragma GCC diagnostic pop + +#include "ruy_adapter.h" +#endif // USE_INTGEMM #if defined(WASM) #include "wasm_intgemm_interface.h" #endif -#include -#include -#include -#include #include #include @@ -27,6 +32,11 @@ inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tenso inline int cols(Shape& shape) { return shape[-1]; } inline int rows(Shape& shape) { return shape.elements() / cols(shape); } +// This operates on floats after processing so doesn't care about int8_t vs int16_t. +void AddBias(marian::Tensor C, const marian::Tensor Bias); + +#ifdef USE_INTGEMM + template struct intgemm_; template <> struct intgemm_ {using width = intgemm::Int8; using type = int8_t; @@ -35,8 +45,19 @@ template <> struct intgemm_ {using width = intgemm::Int16; using type = int16_t; constexpr static const Type intgemmType = Type::intgemm16;}; -// This operates on floats after processing so doesn't care about int8_t vs int16_t. 
-void AddBias(marian::Tensor C, const marian::Tensor Bias); + + +#else // USE_INTGEMM + +template struct intgemm_; +template <> struct intgemm_ {using width = IntgemmViaRuy::Int8; + using type = IntgemmViaRuy::Int8::Type; + constexpr static const Type intgemmType = Type::intgemm8;}; +template <> struct intgemm_ {using width = IntgemmViaRuy::Int16; + using type = IntgemmViaRuy::Int16::Type; + constexpr static const Type intgemmType = Type::intgemm16;}; + +#endif // USE_INTGEMM // For loading architecture agnostic models. We do PrepareAndTranpose, because we already transposed // in our binary format. Then we copy the quantizationMultiplier information at the end @@ -86,7 +107,7 @@ void prepareAndTransposeB(io::Item& item, const char * input) { //Copy the quantMult float quantMult = *(reinterpret_cast(reinterpret_cast(input) + item.shape.elements())); *(reinterpret_cast(&(*(output_tensor + item.shape.elements())))) = quantMult; - #else + #else // COMPILE_CPU ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on"); #endif } @@ -106,4 +127,4 @@ void unquantizeWemb(io::Item& item, const char * input) { } //integer } //cpu -} //marian \ No newline at end of file +} //marian diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 4af960cd6..865c97d3f 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -42,7 +42,7 @@ bool shifted_; rows(child(0)->val()), cols(child(0)->val()), val_->data() /*output*/); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; auto input = child(0)->val(); if (!shifted_) { @@ -58,6 +58,14 @@ bool shifted_; rows(child(0)->val()), cols(child(0)->val())); } + #else + // Copied from above. No shifted in ARM. + typedef typename intgemm_::type Integer; + intgemm_::width::PrepareA(child(0)->val()->data(), /*input*/ + val_->data(), /*output*/ + *child(1)->val()->data(), /*Quant Mult*/ + rows(child(0)->val()), + cols(child(0)->val())); #endif }}; #else @@ -259,8 +267,8 @@ struct QuantMultNodeOp : public UnaryNodeOp { #pragma warning(push) #pragma warning(disable: 4127) //VSCODE thinks line 222 is constant conditional expression, which it is only after the template resolution, not before. 
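The `intgemm_<Type>` trait specializations above are what keep the node operators backend-agnostic: the same templated call sites resolve to intgemm on x86 and to `IntgemmViaRuy` on ARM. Below is a minimal, self-contained sketch of that dispatch pattern; every name in it is illustrative rather than marian's actual identifier.

```cpp
// Self-contained sketch of trait-based backend dispatch: a tag selects a backend
// struct exposing the same static interface, so calling code is identical for
// the intgemm (x86) and ruy (ARM) backends.
#include <cstdint>

struct BackendX86 {   // stands in for intgemm::Int8
  static void PrepareA(const float*, std::int8_t*, float /*quantMult*/, int /*rows*/, int /*cols*/) {}
};
struct BackendArm {   // stands in for IntgemmViaRuy::Int8
  static void PrepareA(const float*, std::int8_t*, float /*quantMult*/, int /*rows*/, int /*cols*/) {}
};

enum class Elem { int8 };

template <Elem e> struct backend_;                 // maps element type -> backend
#ifdef USE_INTGEMM
template <> struct backend_<Elem::int8> { using width = BackendX86; using type = std::int8_t; };
#else
template <> struct backend_<Elem::int8> { using width = BackendArm; using type = std::int8_t; };
#endif

template <Elem e>
void prepareA(const float* in, typename backend_<e>::type* out, float quantMult, int rows, int cols) {
  backend_<e>::width::PrepareA(in, out, quantMult, rows, cols);   // same call either way
}
```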
NodeOps forwardOps() override { -#ifdef COMPILE_CPU - return {NodeOp( + return {[=](){ + #ifdef COMPILE_CPU if (vtype == Type::int16) { *val_->data() = 1024.0f; } else if (child(0)->type() == "intgemmSelectColumnsB") { @@ -270,18 +278,22 @@ struct QuantMultNodeOp : public UnaryNodeOp { *val_->data() = *(reinterpret_cast(reinterpret_cast(child(0)->val()->data()) + child(0)->val()->shape().elements())); } else { if (child(0)->graph()->getBackend()->DumpQuantMult()) { + #if defined(USE_INTGEMM) intgemm::MeanStd meanstd = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements(), true); intgemm::MeanStd meanstd2 = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()); std::cerr << "Name: " << name() << " MeanAbs: " << meanstd.mean << " stddevAbs: " << meanstd.stddev << " Mean: " << meanstd2.mean << " stddev: " << meanstd2.stddev << " MaxAbs: " << intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()) << std::endl; + #endif } auto input = child(0)->val(); - *val_->data() = 127.0f / intgemm::MaxAbsolute(input->data(), input->data() + input->size()); + #if defined(USE_INTGEMM) + *val_->data() = 127.0f / intgemm::MaxAbsolute(input->data(), input->data() + input->size()); + #else + *val_->data() = 127.0f / IntgemmViaRuy::MaxAbsolute(input->data(), input->data() + input->size()); + #endif } - )}; -#else - return {NodeOp()}; -#endif + #endif // COMPILE_CPU + }}; } #pragma warning(pop) NodeOps backwardOps() override { @@ -347,9 +359,11 @@ class PrepareBiasForBNodeOp : public NaryNodeOp { float scale_a = *quant_mult_a->data(); float scale_b = *quant_mult_b->data(); int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), bias->data(), val_->data()); - #else + #elif defined(USE_INTGEMM) float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data())); + #else + ABORT("PrepareBias should not be called on ARM"); #endif } }}; @@ -384,9 +398,11 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp { float scale_a = *quant_mult_a->data(); float scale_b = *quant_mult_b->data(); int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), nullptr/*input_bias*/, val_->data()); - #else + #elif defined(USE_INTGEMM) float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); + #else + // Not sure what's going on here. 
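The quantization multiplier computed in `QuantMultNodeOp` above is just 127 over the largest absolute value in the tensor (the int16 path uses a fixed 1024 instead). A scalar sketch of that computation, with an illustrative function name:

```cpp
#include <algorithm>
#include <cmath>

// Scalar sketch of the int8 quantization multiplier: scale so the largest
// absolute value maps to 127; mirrors 127.0f / MaxAbsolute(begin, end) above.
inline float quantMultInt8(const float* begin, const float* end) {
  float maxAbs = 0.0f;
  for (const float* p = begin; p != end; ++p)
    maxAbs = std::max(maxAbs, std::fabs(*p));
  return 127.0f / maxAbs;
}
```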
#endif }}; #else @@ -435,7 +451,7 @@ float scalar_; "Int16::Multiply is not implemented for wasm."); ABORT_IF(intgemm_::intgemmType == Type::intgemm8, "Int8::Multiply is not implemented for wasm."); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ @@ -443,6 +459,17 @@ float scalar_; cols(child(0)->val()), cols(child(1)->val()), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); + #else + typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::UnquantizeAndWrite(unquant_mult); + intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ + reinterpret_cast(child(1)->val()->data()), /*B*/ + val_->data(), /*output*/ + rows(child(0)->val()), + cols(child(0)->val()), + cols(child(1)->val()), + callback); + #endif }}; #else @@ -509,7 +536,7 @@ class AffineNodeOp : public NaryNodeOp { cols(child(0)->val()), cols(child(1)->val()), val_->data()); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; if (!shifted_) { intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ @@ -526,6 +553,19 @@ class AffineNodeOp : public NaryNodeOp { cols(child(1)->val()), /*child(2) is bias*/ intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data(), val_->data())); } + #else + typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::UnquantizeAndAddBiasAndWrite(unquant_mult, + child(2)->val()->data() /*child(2) is bias*/); + intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ + reinterpret_cast(child(1)->val()->data()), /*B*/ + val_->data(), /*output*/ + rows(child(0)->val()), + cols(child(0)->val()), + cols(child(1)->val()), + callback); + + #endif }}; #else diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 4d761cf4b..8cc030539 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -10,11 +10,9 @@ #if MKL_FOUND #include #elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else - #include - #endif // WASM_COMPATIBLE_BLAS + #include +#elif USE_ONNX_SGEMM + #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" #endif #include "integer_common.h" @@ -79,7 +77,6 @@ void ProdBatchedOld(marian::Tensor C, bool transB, float beta, float scalar) { -#if BLAS_FOUND float alpha = scalar; size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); @@ -183,10 +180,6 @@ void ProdBatchedOld(marian::Tensor C, (int)ldc); } #endif -#else - C; A; B; transA; transB; beta; scalar; - ABORT("You need to compile with MKL in order to use the CPU version"); -#endif } void ProdBatched(marian::Tensor C, diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 1d6757927..61053b402 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -1,13 +1,126 @@ +#pragma once #if MKL_FOUND -#include + #include #elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else #include - #endif // WASM_COMPATIBLE_BLAS +#elif USE_ONNX_SGEMM + #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" +#elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif +#if USE_RUY_SGEMM + +// AlignedVector allocates aligned memory and cleans up after itself. 
RAII +// wrapper similar to intgemm's AlignedVector. +template +class AlignedVector { +public: + AlignedVector(size_t num_elem) + : size_(num_elem), + storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} + + T *begin() { return storage_; } + T *data() { return storage_; } + size_t size() const { return size_; } + size_t memSize() const { return sizeof(T) * size_; } + + // Forbid copy + AlignedVector(const AlignedVector &) = delete; + AlignedVector &operator=(const AlignedVector &) = delete; + + ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } + +private: + size_t size_; + T *storage_; +}; + + +inline void GemmRuy(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const float alpha, + const float *A, + const int lda, + const float *B, + const int ldb, + const float beta, + float *C, + const int ldc) { + ruy::Context context; + + // If we need to transpose, we can swap dimensions in layout claim the matrix + // is just column-major. Set ordering so transpose. + const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + + ruy::Matrix lhs; + ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout()); + lhs.set_data(A); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout()); + rhs.set_data(B); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout()); + + if(beta == 0) { + // For beta = 0, we want to avoid the additional allocation. This is a + // large amount of our inference use-cases. sgemm is called with `beta` for + // accumulating gradients in backpropogation, which is 0.0 during + // inference. + + dst.set_data(C); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } + } + + } else { + // @jerinphilip has not yet been able to find a ruy primitive that does in + // place addition to obtain full gemm. + // + // Safe bet is to make an additional allocation to store the result of + // multiply and use the existing values in C. + // + // See also: https://github.com/google/ruy/issues/307 + + AlignedVector intermediate(M * N); + dst.set_data(intermediate.data()); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = intermediate.data(); + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + + inline void sgemm(bool transA, bool transB, int rows_a, @@ -22,9 +135,6 @@ inline void sgemm(bool transA, float* c, int ldc) { #if BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); - #else cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? 
CblasTrans : CblasNoTrans, @@ -39,8 +149,23 @@ inline void sgemm(bool transA, beta, c, ldc); - #endif -#else +#elif USE_ONNX_SGEMM + gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); +#else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); #endif diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h new file mode 100644 index 000000000..ad6bc1ce6 --- /dev/null +++ b/src/tensors/cpu/ruy_adapter.h @@ -0,0 +1,377 @@ +/* + * This file follows intgemm and is a means of retrofitting ruy into the intgemm based wiring in + * `intgemm_interface.h`. ruy is an inference backend used in tensorflow and android deployment and + * has an optimized ARM backend for the multiply operations required. Optimized code for quantize, + * unquantize, transpose are added separately to connect the multiply library to marian. + */ + +#pragma once +#include +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" +#include "ruy/platform.h" +#include "ruy/system_aligned_alloc.h" +#pragma GCC diagnostic pop + +#if RUY_PLATFORM_NEON +#include +#endif + +namespace marian { +namespace cpu { +namespace integer { + +using Index = unsigned int; + +#if RUY_PLATFORM_NEON + +/* + * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t. + */ +inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { + const float32x4_t *Input = reinterpret_cast(input); + const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); + + int8x8_t *Output = reinterpret_cast(output); + while(Input != InputEnd) { + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 + int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); + + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); + // VQMOVN.S32 d0,q0 + int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); + + // Combine two ints. + // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); + int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); + + // Vector saturating narrow integer + int8x8_t s8x8 = vqmovn_s16(s16x8); + + *Output = s8x8; + ++Output; + }; +} + +inline void _transpose_16x16(const int8_t *src, + Index i, + Index j, + Index rows, + Index cols, + int8_t *dst) { + // Implemented following the algorithm described in + // https://stackoverflow.com/a/29587984/4565794 + // + // permute n 32-bit rows + // permute n 64-bit rows + // ... + // permute n simd_width/2-bit rows + + // clang-format off + + // Permute 8 8-bit rows. + // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices. 
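A scalar reference for the NEON `quantize()` routine above, i.e. what the vector code computes per element; the helper name is illustrative and the note on rounding is an assumption worth checking against the intrinsics' documentation.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Scalar reference for quantize(): scale, round to nearest (vcvtnq_s32_f32),
// then saturate to the int8 range (vqmovn). Tie-breaking may differ slightly:
// NEON rounds ties to even, std::lround rounds ties away from zero.
inline void quantizeRef(const float* input, std::int8_t* output, float scale,
                        std::size_t rows, std::size_t width) {
  for (std::size_t i = 0; i < rows * width; ++i) {
    long r = std::lround(input[i] * scale);
    r = std::max<long>(-128, std::min<long>(127, r));
    output[i] = static_cast<std::int8_t>(r);
  }
}
```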
+ + Index srcRowBegin = i*cols + j; + int8x16x2_t r0 = vtrnq_s8(vld1q_s8(&src[ 0*cols + srcRowBegin]), vld1q_s8(&src[ 1*cols + srcRowBegin])); + int8x16x2_t r1 = vtrnq_s8(vld1q_s8(&src[ 2*cols + srcRowBegin]), vld1q_s8(&src[ 3*cols + srcRowBegin])); + int8x16x2_t r2 = vtrnq_s8(vld1q_s8(&src[ 4*cols + srcRowBegin]), vld1q_s8(&src[ 5*cols + srcRowBegin])); + int8x16x2_t r3 = vtrnq_s8(vld1q_s8(&src[ 6*cols + srcRowBegin]), vld1q_s8(&src[ 7*cols + srcRowBegin])); + int8x16x2_t r4 = vtrnq_s8(vld1q_s8(&src[ 8*cols + srcRowBegin]), vld1q_s8(&src[ 9*cols + srcRowBegin])); + int8x16x2_t r5 = vtrnq_s8(vld1q_s8(&src[10*cols + srcRowBegin]), vld1q_s8(&src[11*cols + srcRowBegin])); + int8x16x2_t r6 = vtrnq_s8(vld1q_s8(&src[12*cols + srcRowBegin]), vld1q_s8(&src[13*cols + srcRowBegin])); + int8x16x2_t r7 = vtrnq_s8(vld1q_s8(&src[14*cols + srcRowBegin]), vld1q_s8(&src[15*cols + srcRowBegin])); + + + // Permute 8 16-bit rows. + // Next step is to treat the entries as int16x8x2 (via cast) and do + // transpose for int16, which will now leave intra-2 pairs intact while + // transposing inter 2-pairs into the right places. + int16x8x2_t t0 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[0]), vreinterpretq_s16_s8(r1.val[0])); + int16x8x2_t t1 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[0]), vreinterpretq_s16_s8(r3.val[0])); + int16x8x2_t t2 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[0]), vreinterpretq_s16_s8(r5.val[0])); + int16x8x2_t t3 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[0]), vreinterpretq_s16_s8(r7.val[0])); + int16x8x2_t t4 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[1]), vreinterpretq_s16_s8(r1.val[1])); + int16x8x2_t t5 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[1]), vreinterpretq_s16_s8(r3.val[1])); + int16x8x2_t t6 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[1]), vreinterpretq_s16_s8(r5.val[1])); + int16x8x2_t t7 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[1]), vreinterpretq_s16_s8(r7.val[1])); + + // Permute 8 32-bit rows. + int32x4x2_t x0 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[0]), vreinterpretq_s32_s16(t1.val[0])); + int32x4x2_t x1 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[0]), vreinterpretq_s32_s16(t5.val[0])); + int32x4x2_t x2 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[1]), vreinterpretq_s32_s16(t1.val[1])); + int32x4x2_t x3 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[1]), vreinterpretq_s32_s16(t5.val[1])); + + int32x4x2_t x4 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[0]), vreinterpretq_s32_s16(t3.val[0])); + int32x4x2_t x5 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[0]), vreinterpretq_s32_s16(t7.val[0])); + int32x4x2_t x6 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[1]), vreinterpretq_s32_s16(t3.val[1])); + int32x4x2_t x7 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[1]), vreinterpretq_s32_s16(t7.val[1])); + + // There is no permute 8 64-bit rows available. + // Instead we follow extracting low and high and placing them into the right places. 
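For reference, this is the scalar operation that `_transpose_16x16` (together with the tiled `transpose()` wrapper further down) implements with NEON permutes, shown here over a whole matrix rather than per 16x16 tile; the function name is illustrative.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar reference: out (cols x rows) is the transpose of in (rows x cols).
inline void transposeRef(const std::int8_t* in, std::size_t rows, std::size_t cols,
                         std::int8_t* out) {
  for (std::size_t r = 0; r < rows; ++r)
    for (std::size_t c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}
```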
+ Index dstRowBegin = j*rows + i; + vst1q_s8(&dst[ 0*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[0]), vget_low_s32(x4.val[0])))); + vst1q_s8(&dst[ 1*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[0]), vget_low_s32(x5.val[0])))); + vst1q_s8(&dst[ 2*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[0]), vget_low_s32(x6.val[0])))); + vst1q_s8(&dst[ 3*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[0]), vget_low_s32(x7.val[0])))); + vst1q_s8(&dst[ 4*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[1]), vget_low_s32(x4.val[1])))); + vst1q_s8(&dst[ 5*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[1]), vget_low_s32(x5.val[1])))); + vst1q_s8(&dst[ 6*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[1]), vget_low_s32(x6.val[1])))); + vst1q_s8(&dst[ 7*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[1]), vget_low_s32(x7.val[1])))); + + vst1q_s8(&dst[ 8*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[0]), vget_high_s32(x4.val[0])))); + vst1q_s8(&dst[ 9*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[0]), vget_high_s32(x5.val[0])))); + vst1q_s8(&dst[10*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[0]), vget_high_s32(x6.val[0])))); + vst1q_s8(&dst[11*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[0]), vget_high_s32(x7.val[0])))); + vst1q_s8(&dst[12*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[1]), vget_high_s32(x4.val[1])))); + vst1q_s8(&dst[13*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[1]), vget_high_s32(x5.val[1])))); + vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); + vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); + + // clang-format on +} + +// Specialization for int8_t +inline void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { + constexpr Index tile_size = 16; + // TODO(jerin): Enable + // assert(rows % tile_size == 0 && cols & tile_size == 0); + for(Index i = 0; i < rows; i += tile_size) { + for(Index j = 0; j < cols; j += tile_size) { + _transpose_16x16(input, i, j, rows, cols, output); + } + } +} + +struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); + + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); + } + } + } + +private: + float unquant_multiplier_; + const float *input_bias_prepared_; +}; + +struct UnquantizeAndWrite { + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; + } + } + } + +private: + float unquant_multiplier_; +}; + +#endif + +/* + * The following nomenclature comes from intgemm. The current state of code is to keep the + * intgemm_interface.h diff minimal. There are possibly better abstractions. + */ +struct IntgemmViaRuy { + // Intgemm nomenclature expects Int8. Missing functions are ABORTs. + struct Int8 { + using Type = int8_t; + static void PrepareBQuantizedTransposed(const Type *input, + Type *output, + Index rows, + Index cols) { + std::memcpy(output, input, /*count=*/sizeof(Type) * (rows * cols)); + } + + static void PrepareBTransposed(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + quantize(input, output, quant_mult, rows, cols); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + + static void PrepareA(const float *input, + int8_t *output, + float quant_mult, + Index rows, + Index cols) { + quantize(input, output, quant_mult, rows, cols); + } + + static void SelectColumnsB(const Type *input, + Type *output, + Index width, + const Index *cols, + const Index *cols_end) { + // B_prepared is expected to be col-major, for our implementation via ruy. If + // col-major we can memcpy the respective column entries as they're + // sequential. There are width=rows entries. + Index num_cols = static_cast(std::distance(cols, cols_end)); + for(Index c = 0; c < num_cols; ++c) { + std::memcpy(&(output[c * width]), &(input[cols[c] * width]), width); + } + } + + // We don't have callback an no-op capability here yet. Multiply is kept similar to Mozilla + // specification and there are overloads with and without bias to avoid an if inside. This + // method corresponds to the one with bias. 
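Before the `Multiply` overloads, a scalar sketch of what the two NEON callbacks above compute: convert ruy's int32 accumulator back to float, optionally adding a per-column prepared bias (passing a null bias models `UnquantizeAndWrite`). Names are illustrative.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar sketch of UnquantizeAndAddBiasAndWrite / UnquantizeAndWrite: per element,
// output = float(input) * unquantMult (+ bias[column]); the bias is reused for
// every row, matching the "Bias cycles every column" comment above.
inline void unquantizeRef(const std::int32_t* in, float unquantMult, const float* bias,
                          std::size_t rowsA, std::size_t colsB, float* out) {
  for (std::size_t r = 0; r < rowsA; ++r)
    for (std::size_t c = 0; c < colsB; ++c) {
      float v = static_cast<float>(in[r * colsB + c]) * unquantMult;
      out[r * colsB + c] = bias ? v + bias[c] : v;
    }
}
```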
+ // output = A*B + bias + template + static void Multiply(const Type *input_A_prepared, + const Type *input_B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + // It is expected that somehow we have managed to call all prepare by the time + // we are here, with inputs (prepared) in int8_t. All that's left to do is use + // ruy for multiply and then start with the reverse ops to get to fp32. + + // Use ruy to multiply. + // The following is adapted from + // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152 + + ruy::Context context; + ruy::Matrix lhs; + ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); + lhs.set_data(input_A_prepared); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); + rhs.set_data(input_B_prepared); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); + + std::int32_t *dest_ptr = reinterpret_cast(output); + dst.set_data(dest_ptr); + + // When Dst is int32, mul_params is unused. + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + callback(dest_ptr, rows_A, cols_B, output); + } + }; + + // Int16 support is currently missing. + struct Int16 { + using Type = int16_t; + static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } + + static void PrepareA(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + ABORT("PrepareA Unsupported"); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { + ABORT("PrepareBQuantizedTransposed Unsupported"); + } + static void PrepareBTransposed(const float *, Type *, float, Index, Index) { + ABORT("PrepareBTransposed Unsupported"); + } + static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { + ABORT("SelectColumnsB Unsupported"); + } + + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + ABORT("Multiply (A*B) Unsupported"); + } + }; + + template + static T MaxAbsolute(const T *begin, const T *end) { + T result = 0; + for(auto p = begin; p < end; ++p) { + result = std::max(result, std::abs(*p)); + } + return result; + } +}; + +} // namespace integer +} // namespace cpu +} // namespace marian
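Putting the pieces together, below is a self-contained scalar model of the int8 path that `IntgemmViaRuy` wires up for ARM: quantize both operands with 127/maxAbs, multiply-accumulate in integers (what ruy does), then unquantize the int32 accumulator with roughly 1/(quantMultA * quantMultB). All names are illustrative; the code is a reference model, not marian's implementation.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar model of the quantized GEMM path: C ~= (A_q * B_q) / (qa * qb).
static void quantizedGemmRef(const std::vector<float>& A, const std::vector<float>& B,
                             std::vector<float>& C, int rowsA, int width, int colsB) {
  auto quantMult = [](const std::vector<float>& x) {
    float maxAbs = 0.f;
    for (float v : x) maxAbs = std::max(maxAbs, std::fabs(v));
    return 127.0f / maxAbs;                          // cf. 127.0f / MaxAbsolute(...)
  };
  const float qa = quantMult(A), qb = quantMult(B);

  auto q8 = [](float v, float q) {
    long r = std::lround(v * q);
    return static_cast<std::int8_t>(std::max<long>(-128, std::min<long>(127, r)));
  };

  C.assign(rowsA * colsB, 0.f);
  for (int i = 0; i < rowsA; ++i)
    for (int j = 0; j < colsB; ++j) {
      std::int32_t acc = 0;                          // ruy accumulates in int32
      for (int k = 0; k < width; ++k)
        acc += static_cast<std::int32_t>(q8(A[i * width + k], qa))
             * static_cast<std::int32_t>(q8(B[k * colsB + j], qb));
      C[i * colsB + j] = acc / (qa * qb);            // unquantize, cf. UnquantizeAndWrite
    }
}
```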