diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml new file mode 100644 index 000000000..55e2978cc --- /dev/null +++ b/.github/workflows/arm.yml @@ -0,0 +1,120 @@ +name: ARM +'on': + push: + branches: + - main + - ci-sandbox + pull_request: + branches: + - '**' +env: + ccache_basedir: ${{ github.workspace }} + ccache_dir: "${{ github.workspace }}/.ccache" + ccache_compilercheck: content + ccache_compress: 'true' + ccache_compresslevel: 9 + ccache_maxsize: 200M + ccache_cmake: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache + ndk: "${{ github.workspace }}/android-ndk-r23b" + abi: "arm64-v8a" + minsdk_version : 28 + android_platform: 28 + +jobs: + ubuntu: + name: "arm-v8a cross-compile via Android NDK" + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install prerequisites + run: | + wget -c --quiet https://dl.google.com/android/repository/android-ndk-r23b-linux.zip + unzip -qq android-ndk-r23b-linux.zip + sudo apt-get -y install ccache cmake + + - name: Generate ccache_vars for ccache based on machine + shell: bash + id: ccache_vars + run: |- + echo "::set-output name=hash::$(echo ${{ env.ccache_compilercheck }})" + echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')" + + - name: Cache-op for build-cache through ccache + uses: actions/cache@v2 + with: + path: ${{ env.ccache_dir }} + key: ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }} + restore-keys: |- + ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }} + ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }} + ccache-${{ matrix.identifier }} + + - name: ccache environment setup + run: |- + echo "CCACHE_COMPILER_CHECK=${{ env.ccache_compilercheck }}" >> $GITHUB_ENV + echo "CCACHE_BASEDIR=${{ env.ccache_basedir }}" >> $GITHUB_ENV + echo "CCACHE_COMPRESS=${{ env.ccache_compress }}" >> $GITHUB_ENV + echo "CCACHE_COMPRESSLEVEL=${{ env.ccache_compresslevel }}" >> $GITHUB_ENV + echo "CCACHE_DIR=${{ env.ccache_dir }}" >> $GITHUB_ENV + echo "CCACHE_MAXSIZE=${{ env.ccache_maxsize }}" >> $GITHUB_ENV + + - name: ccache prolog + run: |- + ccache -s # Print current cache stats + ccache -z # Zero cache entry + + - name: Generate buildfiles for marian on android via cmake + run: |- + mkdir -p build + cd build + NDK=${{ env.ndk }} + ABI=${{ env.abi }} + MINSDK_VERSION=${{ env.minsdk_version }} + ANDROID_PLATFORM=${{ env.android_platform }} + OTHER_ANDROID_ARGS=( + -DANDROID_ARM_NEON=TRUE + ) + OTHER_MARIAN_ARGS=( + -DCOMPILE_CUDA=off + -DCOMPILE_CPU=on + -DCMAKE_HAVE_THREADS_LIBRARY=1 + -DCMAKE_USE_WIN32_THREADS_INIT=0 + -DCMAKE_USE_PTHREADS_INIT=1 + -DTHREADS_PREFER_PTHREAD_FLAG=ON + -DBUILD_ARCH=armv8-a + # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. + ) + # Additionally list variables finally configured. + cmake -L \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_TOOLCHAIN=clang \ + -DANDROID_ABI=$ABI \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DANDROID_NATIVE_API_LEVEL=$MINSDKVERSION \ + -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \ + -DANDROID_STL=c++_static \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \ + .. 
+ + + - name : Build marian for android + working-directory: build + run: |- + # Only build marian (lib) for now. + make -j2 + + - name: ccache epilog + run: 'ccache -s # Print current cache stats' + + - uses: actions/upload-artifact@v2 + with: + path: ${{github.workspace}}/build/marian-decoder + + diff --git a/.gitmodules b/.gitmodules index a8facd1fd..536fa3829 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,3 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/google/ruy +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/JishinMaster/simd_utils/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 16986ce4b..efa096f55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,19 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) include(CMakeDependentOption) +# Architecture detection +include(TargetArch) + +target_architecture(CMAKE_TARGET_ARCHITECTURES) +list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len) +if(NOT "${cmake_target_arch_len}" STREQUAL "1") + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "universal") +else() + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}") +endif() + # Custom CMake options option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) @@ -31,24 +44,58 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) -option(M32_BINARIES "Generate 32bit binaries even when building outside of WASM. Useful for testing some WASM specific functionality without the need for the compiling to WASM." OFF) option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) +option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) +option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) +option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) + # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) -CMAKE_DEPENDENT_OPTION(USE_WASM_COMPATIBLE_BLAS "Compile with wasm compatible blas" ON +CMAKE_DEPENDENT_OPTION(USE_ONNX_SGEMM "Compile with wasm compatible blas" ON "USE_WASM_COMPATIBLE_SOURCE" OFF) CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON "USE_WASM_COMPATIBLE_SOURCE" OFF) +if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + set(USE_RUY ON) + + # Apple M1 has Apple Accelerate(?). 
+ if(NOT APPLE) + set(USE_RUY_SGEMM ON) + endif(NOT APPLE) + + set(USE_SIMD_UTILS ON) +else() + set(USE_INTGEMM ON) +endif() + +if(USE_INTGEMM) + add_compile_definitions(USE_INTGEMM=1) +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + add_compile_definitions(ARM FMA SSE) #added for ARM + endif() + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(USE_SIMD_UTILS) + + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) @@ -61,10 +108,11 @@ if (COMPILE_WASM) set(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160") endif() -if(M32_BINARIES OR COMPILE_WASM) + +if(COMPILE_WASM) set("BUILD_WIDTH" "-m32") -else(M32_BINARIES OR COMPILE_WASM) - set("BUILD_WIDTH" "-m64") +else(COMPILE_WASM) + set("BUILD_WIDTH" "") endif() if(NOT COMPILE_WASM) @@ -194,7 +242,6 @@ if(MSVC) add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1) endif(USE_FBGEMM) else(MSVC) - # Check we are using at least g++ 5.0 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "FATAL ERROR: Compiling Marian requires at least g++ 5.0, your version is ${CMAKE_CXX_COMPILER_VERSION}") @@ -249,12 +296,14 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - set(INTRINSICS "-msse4.1") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(INTRINSICS "-msse4.1") + endif () endif() if(USE_FBGEMM) set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_definitions(-DUSE_FBGEMM=1) + add_compile_definitions(USE_FBGEMM=1) endif(USE_FBGEMM) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) @@ -324,6 +373,7 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) + # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. # -C makes gcc keep comments. @@ -544,16 +594,22 @@ endif(USE_MPI) ############################################################################### # Find BLAS library for CPU compilation if(COMPILE_CPU) - set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant + if(USE_INTGEMM) + set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant + endif(USE_INTGEMM) + + if(USE_RUY OR USE_RUY_SGEMM) + set(EXT_LIBS ${EXT_LIBS} ruy) + endif(USE_RUY OR USE_RUY_SGEMM) + add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on # if there are BLAS vendors, we have other runtime checks with sane error messages. 
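The `-flax-vector-conversions` flag enabled above for the simd_utils path is, presumably, needed because that header maps SSE-style helpers onto NEON vector types. The snippet below is an illustrative sketch (not simd_utils' actual code) of the constraint the flag relaxes; all names in it are made up for the example.

```cpp
// Illustrative sketch only: by default GCC/Clang reject implicit conversions
// between distinct vector types, which SSE-on-NEON shim headers tend to rely
// on; -flax-vector-conversions relaxes that check.
#include <arm_neon.h>

inline float32x4_t scaled(float32x4_t v) { return vmulq_n_f32(v, 0.5f); }

inline float32x4_t fromBits(uint32x4_t bits) {
  float32x4_t f = vreinterpretq_f32_u32(bits);  // explicit reinterpret: always legal
  // float32x4_t g = bits;                      // implicit: accepted only with
  //                                            // -flax-vector-conversions
  return scaled(f);
}
```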
- if(USE_WASM_COMPATIBLE_BLAS) + if(USE_ONNX_SGEMM) ## Use a wasm compatible BLAS + ## ^ SGEMM != BLAS set(EXT_LIBS ${EXT_LIBS} onnx-sgemm) - set(BLAS_FOUND TRUE) - set(BLAS_VENDOR "ONNX-SGEMM") - add_definitions(-DBLAS_FOUND=1 -DWASM_COMPATIBLE_BLAS=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock + add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock elseif(APPLE AND USE_APPLE_ACCELERATE) set(BLAS_VENDOR "Accelerate") # see https://developer.apple.com/documentation/accelerate for more info @@ -561,7 +617,9 @@ if(COMPILE_CPU) include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers") set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate") add_definitions(-DBLAS_FOUND=1) - else(USE_WASM_COMPATIBLE_BLAS) + elseif(USE_RUY_SGEMM) + add_compile_definitions(USE_RUY_SGEMM=1) + else(USE_ONNX_SGEMM) if(USE_MKL) find_package(MKL) endif(USE_MKL) @@ -582,7 +640,8 @@ if(COMPILE_CPU) endif(CBLAS_FOUND) endif(BLAS_FOUND) endif(MKL_FOUND) - endif(USE_WASM_COMPATIBLE_BLAS) + endif(USE_ONNX_SGEMM) + endif(COMPILE_CPU) ############################################################################### diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..f653e3e28 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,142 @@ +# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." 
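For orientation, the same "which architecture is this" question is answered at the source level elsewhere in this patch with predefined compiler macros. The sketch below (illustrative, not part of the patch) shows that style of check; it mirrors what the configure-time `#error` trick in TargetArch.cmake below extracts from the compiler output.

```cpp
// Sketch: compile-time architecture check via predefined macros, the source-level
// counterpart of TargetArch.cmake's configure-time detection.
#include <cstdio>

static const char* targetArch() {
#if defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__)
  return "armv8 / NEON";
#elif defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)
  return "x86_64";
#elif defined(__i386__) || defined(_M_IX86)
  return "i386";
#else
  return "unknown";
#endif
}

int main() { std::puts(targetArch()); return 0; }
```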
+ +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__) + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\ + || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\ + || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\ + || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) + #error cmake_ARCH armv8 + #elif defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. 
+ + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index f335c218a..f2062c381 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -5,17 +5,31 @@ add_subdirectory(./yaml-cpp) if(NOT USE_WASM_COMPATIBLE_SOURCE) add_subdirectory(./SQLiteCpp) add_subdirectory(./zlib) + add_subdirectory(./faiss) include_directories(./faiss) endif() -add_subdirectory(./pathie-cpp) -set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") -add_subdirectory(./intgemm) +add_subdirectory(./pathie-cpp) -if(USE_WASM_COMPATIBLE_BLAS) +if(USE_INTGEMM) + set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") + add_subdirectory(./intgemm) +endif(USE_INTGEMM) + +if(USE_RUY) + set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE) + add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL) + add_subdirectory(ruy EXCLUDE_FROM_ALL) +endif(USE_RUY) + +if(USE_ONNX_SGEMM) add_subdirectory(./onnxjs) -endif(USE_WASM_COMPATIBLE_BLAS) +endif(USE_ONNX_SGEMM) if(USE_FBGEMM) # @TODO: find out if this is somehow harmful. 
This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp index 103b0910e..a26c2b4d3 100644 --- a/src/3rd_party/faiss/VectorTransform.cpp +++ b/src/3rd_party/faiss/VectorTransform.cpp @@ -132,7 +132,18 @@ const float *fvecs_maybe_subsample( return x_subset; } -#if 1 // def __SSE__ +float fvec_norm_L2sqr_ref(const float *x, size_t d) +{ + size_t i; + double res = 0; + for (i = 0; i < d; i++) + res += x[i] * x[i]; + return res; +} + + + +#ifdef __SSE__ // reads 0 <= d < 4 floats as __m128 static inline __m128 masked_read(int d, const float *x) { diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy new file mode 160000 index 000000000..2d950b3bf --- /dev/null +++ b/src/3rd_party/ruy @@ -0,0 +1 @@ +Subproject commit 2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0 diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 3ffdc0065..7f669c3f8 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d +Subproject commit 7f669c3f8f5fcc288838f3beba88a04533824f73 diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..696036258 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9123e2324..76aa0e2b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,6 +7,7 @@ include_directories(3rd_party/sentencepiece) include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) include_directories(3rd_party/fbgemm/include) include_directories(3rd_party/intgemm) +include_directories(3rd_party/ruy) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party) include_directories(${CMAKE_BINARY_DIR}/local/include) @@ -99,12 +100,15 @@ set(MARIAN_SOURCES $ ) -if (NOT USE_WASM_COMPATIBLE_SOURCE) +if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID) list(APPEND MARIAN_SOURCES 3rd_party/ExceptionWithCallStack.cpp + ) +endif() +if (NOT USE_WASM_COMPATIBLE_SOURCE) + list(APPEND MARIAN_SOURCES data/corpus_sqlite.cpp - tensors/cpu/fbgemm/packed_gemm.cpp layers/lsh.cpp optimizers/quantizer.cpp @@ -122,6 +126,11 @@ if (NOT USE_WASM_COMPATIBLE_SOURCE) $ $ ) + if(USE_FBGEMM) + list(APPEND MARIAN_SOURCES + tensors/cpu/fbgemm/packed_gemm.cpp + ) + endif(USE_FBGEMM) endif() add_library(marian STATIC ${MARIAN_SOURCES}) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9e90bbb59..3bcc6518e 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -15,16 +15,6 @@ #include #include -#if MKL_FOUND -#include -#elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else - #include - #endif // WASM_COMPATIBLE_BLAS -#endif - namespace marian { // TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 999b97b42..c21cf4207 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,7 +128,7 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults - struct sigaction sa = { {0} }; + struct sigaction sa = {}; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { 
ABORT("Segmentation fault"); }; @@ -149,8 +149,8 @@ void switchtoMultinodeLogging(std::string nodeIdStr) { namespace marian { std::string noinline getCallStack(size_t skipLevels) { - #ifdef WASM_COMPATIBLE_SOURCE - return "Callstacks not supported in WASM builds currently"; + #if defined(WASM_COMPATIBLE_SOURCE) || defined(__ANDROID__) + return "Callstacks not supported in WASM or Android builds currently"; #else return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true); #endif diff --git a/src/common/types.h b/src/common/types.h index 575e77120..6052896fc 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -17,7 +17,17 @@ #include #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code + +#ifdef __AVX__ #include +#elif __SSE__ +#include +#endif + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif + #endif #ifdef __CUDACC__ // nvcc is compiling this code @@ -161,6 +171,43 @@ struct intgemm8 { #ifndef __CUDACC__ // vectorized types not available from .cu files + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +// The following struct fills this structure for ARM with NEON SIMD, changing +// __m128 and _mm_set1_ps with the equivalents on ARM-NEON. +struct float32x4 { +private: + // NEON uses 128-bit SIMD registers, same as SSE. We are copying this class + // and locally aliasing __m128 to float32x4_t, which is the NEON + // equivalent. + using __m128 = float32x4_t; + __m128 f_; + +public: + float32x4() {} + float32x4(const __m128& f) : f_(f) {} + // __m128 _mm_set1_ps(float) copies value into all slots, vdupq_n_f32 is it's + // NEON equivalent. + float32x4(const float& f) : f_(vdupq_n_f32(f)) {} + + operator const __m128&() const { return f_; } + operator __m128&() { return f_; } + + float operator[] (size_t i) const { + return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats + } + + friend std::ostream& operator<<(std::ostream& out, float32x4 f4) { + float* a = (float*)&f4; + out << "[" << a[0]; + for(int i = 1; i < 4; i++) + out << " " << a[i]; + out << "]"; + return out; + } +}; + +#else // @TODO: check what intrinsics are actually available. 
struct float32x4 { private: @@ -188,6 +235,8 @@ struct float32x4 { } }; +#endif + // @TODO: consider how code can be shared via templating #ifdef __AVX__ struct float32x8 { diff --git a/src/functional/operators.h b/src/functional/operators.h index d79ac3c05..1a67e22a1 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -213,7 +213,15 @@ struct Ops { // __CUDA_ARCH__ is defined when compiling device (GPU) code #ifndef __CUDACC__ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "3rd_party/simd_utils/simd_utils.h" +#pragma GCC diagnostic pop +#else #include "3rd_party/sse_mathfun.h" +#endif + namespace marian { namespace functional { diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index a91778ed5..19b040ab9 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -3,7 +3,7 @@ #include "tensors/cpu/prod_blas.h" #if BLAS_FOUND -#include "3rd_party/faiss/IndexLSH.h" +#include "faiss/IndexLSH.h" #endif namespace marian { @@ -127,4 +127,4 @@ Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) { } #endif -} // namespace marian \ No newline at end of file +} // namespace marian diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 583a2f792..a571cbc80 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -78,14 +78,27 @@ class Backend : public marian::Backend { void setInt8(bool optimize) override { int8_ = optimize; } bool isInt8() override { return int8_; } - void setShifted(bool shifted) override { shifted_ = shifted; } + void setShifted(bool shifted) override { +#if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); + shifted_ = false; +#else + shifted_ = shifted; +#endif + } bool isShifted() override { return shifted_; } void setShiftedAll(bool shiftedAll) override { +#if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); + shiftedAll_ = false; + shifted_ = false; +#else shiftedAll_ = shiftedAll; if (shiftedAll_) { shifted_ = true; } +#endif } bool isShiftedAll() override { diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index d93719d8e..4af120e4d 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -209,7 +209,7 @@ class ExpressionGraphPackable : public ExpressionGraph { if (gemmElementType == Type::intgemm8) { #if defined(WASM) ABORT("Int8::PrepareA is not implemented for wasm."); -#else +#elif defined(USE_INTGEMM) float quantMult = 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements()); intgemm::Int8::PrepareA(tmp->data(), /*input*/ paramMat->data(), /*output*/ @@ -218,11 +218,13 @@ class ExpressionGraphPackable : public ExpressionGraph { cols(val)); //Put the quantMult at the back of the tensor *(reinterpret_cast(paramMat->data() + val->shape().elements())) = quantMult; +#else + ABORT("Int8::PrepareA not implemented yet for ruy"); #endif } else { #if defined(WASM) ABORT("Int16::PrepareA is not implemented for wasm."); -#else +#elif defined(USE_INTGEMM) float quantMult = 1024.0f; intgemm::Int16::PrepareA(tmp->data(), /*input*/ paramMat->data(), /*output*/ @@ -231,6 +233,8 @@ class ExpressionGraphPackable : public ExpressionGraph { cols(val)); //Put the quantMult at the back of the tensor *(reinterpret_cast(paramMat->data() + 
val->shape().elements())) = quantMult; +#else + ABORT("Int16::PrepareA is not implemented for wasm."); #endif } diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp index 21e7254fa..edaeb112c 100644 --- a/src/tensors/cpu/integer_common.cpp +++ b/src/tensors/cpu/integer_common.cpp @@ -1,5 +1,14 @@ #include "integer_common.h" +#ifdef __SSE__ +#include +#include +#include +#include +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#endif + namespace marian { namespace cpu { namespace integer { @@ -14,7 +23,9 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { for(int j = 0; j < m; ++j) { int i = 0; + #ifdef __AVX512F__ + // Multiples of 16 add together. int n16 = n & ~15; for(; i < n16; i += 16) { __m512 ai = _mm512_loadu_ps(x + j * n + i); @@ -22,7 +33,8 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m512 yi = _mm512_add_ps(ai, bi); _mm512_storeu_ps(y + j * n + i, yi); } -#else +#elif __SSE__ + // Multiples of 4 add together. int n4 = (n / 4) * 4; for(; i < n4; i += 4) { __m128 ai = _mm_loadu_ps(x + j * n + i); @@ -30,6 +42,21 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m128 yi = _mm_add_ps(ai, bi); _mm_storeu_ps(y + j * n + i, yi); } +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) + int n4 = (n / 4) * 4; + using __m128 = float32x4_t; + for(; i < n4; i += 4) { + __m128 ai = vld1q_f32(x + j * n + i); + __m128 bi = vld1q_f32(bias + i); + __m128 yi = vaddq_f32(ai, bi); + vst1q_f32(y + j * n + i, yi); + } + +#else + // StandardCPP No SIMD case. + for(i = 0; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } #endif for(; i < n; i++) { y[j * n + i] = x[j * n + i] + bias[i]; @@ -39,4 +66,4 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { } //integer } //cpu -} //marian \ No newline at end of file +} //marian diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index 97ca79c12..d05440ef1 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -4,15 +4,20 @@ #include "tensors/tensor_operators.h" #include "tensors/cpu/aligned.h" #include "common/io_item.h" +#ifdef USE_INTGEMM #include "3rd_party/intgemm/intgemm/intgemm.h" +#else // USE_INTGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" +#include +#pragma GCC diagnostic pop + +#include "ruy_adapter.h" +#endif // USE_INTGEMM #if defined(WASM) #include "wasm_intgemm_interface.h" #endif -#include -#include -#include -#include #include #include @@ -27,6 +32,11 @@ inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tenso inline int cols(Shape& shape) { return shape[-1]; } inline int rows(Shape& shape) { return shape.elements() / cols(shape); } +// This operates on floats after processing so doesn't care about int8_t vs int16_t. +void AddBias(marian::Tensor C, const marian::Tensor Bias); + +#ifdef USE_INTGEMM + template struct intgemm_; template <> struct intgemm_ {using width = intgemm::Int8; using type = int8_t; @@ -35,8 +45,19 @@ template <> struct intgemm_ {using width = intgemm::Int16; using type = int16_t; constexpr static const Type intgemmType = Type::intgemm16;}; -// This operates on floats after processing so doesn't care about int8_t vs int16_t. 
-void AddBias(marian::Tensor C, const marian::Tensor Bias); + + +#else // USE_INTGEMM + +template struct intgemm_; +template <> struct intgemm_ {using width = IntgemmViaRuy::Int8; + using type = IntgemmViaRuy::Int8::Type; + constexpr static const Type intgemmType = Type::intgemm8;}; +template <> struct intgemm_ {using width = IntgemmViaRuy::Int16; + using type = IntgemmViaRuy::Int16::Type; + constexpr static const Type intgemmType = Type::intgemm16;}; + +#endif // USE_INTGEMM // For loading architecture agnostic models. We do PrepareAndTranpose, because we already transposed // in our binary format. Then we copy the quantizationMultiplier information at the end @@ -86,7 +107,7 @@ void prepareAndTransposeB(io::Item& item, const char * input) { //Copy the quantMult float quantMult = *(reinterpret_cast(reinterpret_cast(input) + item.shape.elements())); *(reinterpret_cast(&(*(output_tensor + item.shape.elements())))) = quantMult; - #else + #else // COMPILE_CPU ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on"); #endif } @@ -106,4 +127,4 @@ void unquantizeWemb(io::Item& item, const char * input) { } //integer } //cpu -} //marian \ No newline at end of file +} //marian diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 4af960cd6..865c97d3f 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -42,7 +42,7 @@ bool shifted_; rows(child(0)->val()), cols(child(0)->val()), val_->data() /*output*/); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; auto input = child(0)->val(); if (!shifted_) { @@ -58,6 +58,14 @@ bool shifted_; rows(child(0)->val()), cols(child(0)->val())); } + #else + // Copied from above. No shifted in ARM. + typedef typename intgemm_::type Integer; + intgemm_::width::PrepareA(child(0)->val()->data(), /*input*/ + val_->data(), /*output*/ + *child(1)->val()->data(), /*Quant Mult*/ + rows(child(0)->val()), + cols(child(0)->val())); #endif }}; #else @@ -259,8 +267,8 @@ struct QuantMultNodeOp : public UnaryNodeOp { #pragma warning(push) #pragma warning(disable: 4127) //VSCODE thinks line 222 is constant conditional expression, which it is only after the template resolution, not before. 
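The `intgemm_<Type>` trait specializations above are what keep the node operators backend-agnostic: the same templated call sites resolve to intgemm on x86 and to `IntgemmViaRuy` on ARM. Below is a minimal, self-contained sketch of that dispatch pattern; every name in it is illustrative rather than marian's actual identifier.

```cpp
// Self-contained sketch of trait-based backend dispatch: a tag selects a backend
// struct exposing the same static interface, so calling code is identical for
// the intgemm (x86) and ruy (ARM) backends.
#include <cstdint>

struct BackendX86 {   // stands in for intgemm::Int8
  static void PrepareA(const float*, std::int8_t*, float /*quantMult*/, int /*rows*/, int /*cols*/) {}
};
struct BackendArm {   // stands in for IntgemmViaRuy::Int8
  static void PrepareA(const float*, std::int8_t*, float /*quantMult*/, int /*rows*/, int /*cols*/) {}
};

enum class Elem { int8 };

template <Elem e> struct backend_;                 // maps element type -> backend
#ifdef USE_INTGEMM
template <> struct backend_<Elem::int8> { using width = BackendX86; using type = std::int8_t; };
#else
template <> struct backend_<Elem::int8> { using width = BackendArm; using type = std::int8_t; };
#endif

template <Elem e>
void prepareA(const float* in, typename backend_<e>::type* out, float quantMult, int rows, int cols) {
  backend_<e>::width::PrepareA(in, out, quantMult, rows, cols);   // same call either way
}
```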
NodeOps forwardOps() override { -#ifdef COMPILE_CPU - return {NodeOp( + return {[=](){ + #ifdef COMPILE_CPU if (vtype == Type::int16) { *val_->data() = 1024.0f; } else if (child(0)->type() == "intgemmSelectColumnsB") { @@ -270,18 +278,22 @@ struct QuantMultNodeOp : public UnaryNodeOp { *val_->data() = *(reinterpret_cast(reinterpret_cast(child(0)->val()->data()) + child(0)->val()->shape().elements())); } else { if (child(0)->graph()->getBackend()->DumpQuantMult()) { + #if defined(USE_INTGEMM) intgemm::MeanStd meanstd = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements(), true); intgemm::MeanStd meanstd2 = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()); std::cerr << "Name: " << name() << " MeanAbs: " << meanstd.mean << " stddevAbs: " << meanstd.stddev << " Mean: " << meanstd2.mean << " stddev: " << meanstd2.stddev << " MaxAbs: " << intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()) << std::endl; + #endif } auto input = child(0)->val(); - *val_->data() = 127.0f / intgemm::MaxAbsolute(input->data(), input->data() + input->size()); + #if defined(USE_INTGEMM) + *val_->data() = 127.0f / intgemm::MaxAbsolute(input->data(), input->data() + input->size()); + #else + *val_->data() = 127.0f / IntgemmViaRuy::MaxAbsolute(input->data(), input->data() + input->size()); + #endif } - )}; -#else - return {NodeOp()}; -#endif + #endif // COMPILE_CPU + }}; } #pragma warning(pop) NodeOps backwardOps() override { @@ -347,9 +359,11 @@ class PrepareBiasForBNodeOp : public NaryNodeOp { float scale_a = *quant_mult_a->data(); float scale_b = *quant_mult_b->data(); int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), bias->data(), val_->data()); - #else + #elif defined(USE_INTGEMM) float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data())); + #else + ABORT("PrepareBias should not be called on ARM"); #endif } }}; @@ -384,9 +398,11 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp { float scale_a = *quant_mult_a->data(); float scale_b = *quant_mult_b->data(); int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), nullptr/*input_bias*/, val_->data()); - #else + #elif defined(USE_INTGEMM) float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); + #else + // Not sure what's going on here. 
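The quantization multiplier computed in `QuantMultNodeOp` above is just 127 over the largest absolute value in the tensor (the int16 path uses a fixed 1024 instead). A scalar sketch of that computation, with an illustrative function name:

```cpp
#include <algorithm>
#include <cmath>

// Scalar sketch of the int8 quantization multiplier: scale so the largest
// absolute value maps to 127; mirrors 127.0f / MaxAbsolute(begin, end) above.
inline float quantMultInt8(const float* begin, const float* end) {
  float maxAbs = 0.0f;
  for (const float* p = begin; p != end; ++p)
    maxAbs = std::max(maxAbs, std::fabs(*p));
  return 127.0f / maxAbs;
}
```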
#endif }}; #else @@ -435,7 +451,7 @@ float scalar_; "Int16::Multiply is not implemented for wasm."); ABORT_IF(intgemm_::intgemmType == Type::intgemm8, "Int8::Multiply is not implemented for wasm."); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ @@ -443,6 +459,17 @@ float scalar_; cols(child(0)->val()), cols(child(1)->val()), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); + #else + typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::UnquantizeAndWrite(unquant_mult); + intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ + reinterpret_cast(child(1)->val()->data()), /*B*/ + val_->data(), /*output*/ + rows(child(0)->val()), + cols(child(0)->val()), + cols(child(1)->val()), + callback); + #endif }}; #else @@ -509,7 +536,7 @@ class AffineNodeOp : public NaryNodeOp { cols(child(0)->val()), cols(child(1)->val()), val_->data()); - #else + #elif defined(USE_INTGEMM) typedef typename intgemm_::type Integer; if (!shifted_) { intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ @@ -526,6 +553,19 @@ class AffineNodeOp : public NaryNodeOp { cols(child(1)->val()), /*child(2) is bias*/ intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data(), val_->data())); } + #else + typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::UnquantizeAndAddBiasAndWrite(unquant_mult, + child(2)->val()->data() /*child(2) is bias*/); + intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ + reinterpret_cast(child(1)->val()->data()), /*B*/ + val_->data(), /*output*/ + rows(child(0)->val()), + cols(child(0)->val()), + cols(child(1)->val()), + callback); + + #endif }}; #else diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 4d761cf4b..8cc030539 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -10,11 +10,9 @@ #if MKL_FOUND #include #elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else - #include - #endif // WASM_COMPATIBLE_BLAS + #include +#elif USE_ONNX_SGEMM + #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" #endif #include "integer_common.h" @@ -79,7 +77,6 @@ void ProdBatchedOld(marian::Tensor C, bool transB, float beta, float scalar) { -#if BLAS_FOUND float alpha = scalar; size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); @@ -183,10 +180,6 @@ void ProdBatchedOld(marian::Tensor C, (int)ldc); } #endif -#else - C; A; B; transA; transB; beta; scalar; - ABORT("You need to compile with MKL in order to use the CPU version"); -#endif } void ProdBatched(marian::Tensor C, diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 1d6757927..61053b402 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -1,13 +1,126 @@ +#pragma once #if MKL_FOUND -#include + #include #elif BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" - #else #include - #endif // WASM_COMPATIBLE_BLAS +#elif USE_ONNX_SGEMM + #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" +#elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif +#if USE_RUY_SGEMM + +// AlignedVector allocates aligned memory and cleans up after itself. 
RAII +// wrapper similar to intgemm's AlignedVector. +template +class AlignedVector { +public: + AlignedVector(size_t num_elem) + : size_(num_elem), + storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} + + T *begin() { return storage_; } + T *data() { return storage_; } + size_t size() const { return size_; } + size_t memSize() const { return sizeof(T) * size_; } + + // Forbid copy + AlignedVector(const AlignedVector &) = delete; + AlignedVector &operator=(const AlignedVector &) = delete; + + ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } + +private: + size_t size_; + T *storage_; +}; + + +inline void GemmRuy(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const float alpha, + const float *A, + const int lda, + const float *B, + const int ldb, + const float beta, + float *C, + const int ldc) { + ruy::Context context; + + // If we need to transpose, we can swap dimensions in layout claim the matrix + // is just column-major. Set ordering so transpose. + const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + + ruy::Matrix lhs; + ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout()); + lhs.set_data(A); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout()); + rhs.set_data(B); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout()); + + if(beta == 0) { + // For beta = 0, we want to avoid the additional allocation. This is a + // large amount of our inference use-cases. sgemm is called with `beta` for + // accumulating gradients in backpropogation, which is 0.0 during + // inference. + + dst.set_data(C); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } + } + + } else { + // @jerinphilip has not yet been able to find a ruy primitive that does in + // place addition to obtain full gemm. + // + // Safe bet is to make an additional allocation to store the result of + // multiply and use the existing values in C. + // + // See also: https://github.com/google/ruy/issues/307 + + AlignedVector intermediate(M * N); + dst.set_data(intermediate.data()); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = intermediate.data(); + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + + inline void sgemm(bool transA, bool transB, int rows_a, @@ -22,9 +135,6 @@ inline void sgemm(bool transA, float* c, int ldc) { #if BLAS_FOUND - #if WASM_COMPATIBLE_BLAS - gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); - #else cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? 
CblasTrans : CblasNoTrans, @@ -39,8 +149,23 @@ inline void sgemm(bool transA, beta, c, ldc); - #endif -#else +#elif USE_ONNX_SGEMM + gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); +#else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); #endif diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h new file mode 100644 index 000000000..ad6bc1ce6 --- /dev/null +++ b/src/tensors/cpu/ruy_adapter.h @@ -0,0 +1,377 @@ +/* + * This file follows intgemm and is a means of retrofitting ruy into the intgemm based wiring in + * `intgemm_interface.h`. ruy is an inference backend used in tensorflow and android deployment and + * has an optimized ARM backend for the multiply operations required. Optimized code for quantize, + * unquantize, transpose are added separately to connect the multiply library to marian. + */ + +#pragma once +#include +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" +#include "ruy/platform.h" +#include "ruy/system_aligned_alloc.h" +#pragma GCC diagnostic pop + +#if RUY_PLATFORM_NEON +#include +#endif + +namespace marian { +namespace cpu { +namespace integer { + +using Index = unsigned int; + +#if RUY_PLATFORM_NEON + +/* + * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t. + */ +inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { + const float32x4_t *Input = reinterpret_cast(input); + const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); + + int8x8_t *Output = reinterpret_cast(output); + while(Input != InputEnd) { + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 + int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); + + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); + // VQMOVN.S32 d0,q0 + int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); + + // Combine two ints. + // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); + int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); + + // Vector saturating narrow integer + int8x8_t s8x8 = vqmovn_s16(s16x8); + + *Output = s8x8; + ++Output; + }; +} + +inline void _transpose_16x16(const int8_t *src, + Index i, + Index j, + Index rows, + Index cols, + int8_t *dst) { + // Implemented following the algorithm described in + // https://stackoverflow.com/a/29587984/4565794 + // + // permute n 32-bit rows + // permute n 64-bit rows + // ... + // permute n simd_width/2-bit rows + + // clang-format off + + // Permute 8 8-bit rows. + // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices. 
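A scalar reference for the NEON `quantize()` routine above, i.e. what the vector code computes per element; the helper name is illustrative and the note on rounding is an assumption worth checking against the intrinsics' documentation.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Scalar reference for quantize(): scale, round to nearest (vcvtnq_s32_f32),
// then saturate to the int8 range (vqmovn). Tie-breaking may differ slightly:
// NEON rounds ties to even, std::lround rounds ties away from zero.
inline void quantizeRef(const float* input, std::int8_t* output, float scale,
                        std::size_t rows, std::size_t width) {
  for (std::size_t i = 0; i < rows * width; ++i) {
    long r = std::lround(input[i] * scale);
    r = std::max<long>(-128, std::min<long>(127, r));
    output[i] = static_cast<std::int8_t>(r);
  }
}
```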
+ + Index srcRowBegin = i*cols + j; + int8x16x2_t r0 = vtrnq_s8(vld1q_s8(&src[ 0*cols + srcRowBegin]), vld1q_s8(&src[ 1*cols + srcRowBegin])); + int8x16x2_t r1 = vtrnq_s8(vld1q_s8(&src[ 2*cols + srcRowBegin]), vld1q_s8(&src[ 3*cols + srcRowBegin])); + int8x16x2_t r2 = vtrnq_s8(vld1q_s8(&src[ 4*cols + srcRowBegin]), vld1q_s8(&src[ 5*cols + srcRowBegin])); + int8x16x2_t r3 = vtrnq_s8(vld1q_s8(&src[ 6*cols + srcRowBegin]), vld1q_s8(&src[ 7*cols + srcRowBegin])); + int8x16x2_t r4 = vtrnq_s8(vld1q_s8(&src[ 8*cols + srcRowBegin]), vld1q_s8(&src[ 9*cols + srcRowBegin])); + int8x16x2_t r5 = vtrnq_s8(vld1q_s8(&src[10*cols + srcRowBegin]), vld1q_s8(&src[11*cols + srcRowBegin])); + int8x16x2_t r6 = vtrnq_s8(vld1q_s8(&src[12*cols + srcRowBegin]), vld1q_s8(&src[13*cols + srcRowBegin])); + int8x16x2_t r7 = vtrnq_s8(vld1q_s8(&src[14*cols + srcRowBegin]), vld1q_s8(&src[15*cols + srcRowBegin])); + + + // Permute 8 16-bit rows. + // Next step is to treat the entries as int16x8x2 (via cast) and do + // transpose for int16, which will now leave intra-2 pairs intact while + // transposing inter 2-pairs into the right places. + int16x8x2_t t0 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[0]), vreinterpretq_s16_s8(r1.val[0])); + int16x8x2_t t1 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[0]), vreinterpretq_s16_s8(r3.val[0])); + int16x8x2_t t2 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[0]), vreinterpretq_s16_s8(r5.val[0])); + int16x8x2_t t3 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[0]), vreinterpretq_s16_s8(r7.val[0])); + int16x8x2_t t4 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[1]), vreinterpretq_s16_s8(r1.val[1])); + int16x8x2_t t5 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[1]), vreinterpretq_s16_s8(r3.val[1])); + int16x8x2_t t6 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[1]), vreinterpretq_s16_s8(r5.val[1])); + int16x8x2_t t7 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[1]), vreinterpretq_s16_s8(r7.val[1])); + + // Permute 8 32-bit rows. + int32x4x2_t x0 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[0]), vreinterpretq_s32_s16(t1.val[0])); + int32x4x2_t x1 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[0]), vreinterpretq_s32_s16(t5.val[0])); + int32x4x2_t x2 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[1]), vreinterpretq_s32_s16(t1.val[1])); + int32x4x2_t x3 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[1]), vreinterpretq_s32_s16(t5.val[1])); + + int32x4x2_t x4 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[0]), vreinterpretq_s32_s16(t3.val[0])); + int32x4x2_t x5 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[0]), vreinterpretq_s32_s16(t7.val[0])); + int32x4x2_t x6 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[1]), vreinterpretq_s32_s16(t3.val[1])); + int32x4x2_t x7 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[1]), vreinterpretq_s32_s16(t7.val[1])); + + // There is no permute 8 64-bit rows available. + // Instead we follow extracting low and high and placing them into the right places. 
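For reference, this is the scalar operation that `_transpose_16x16` (together with the tiled `transpose()` wrapper further down) implements with NEON permutes, shown here over a whole matrix rather than per 16x16 tile; the function name is illustrative.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar reference: out (cols x rows) is the transpose of in (rows x cols).
inline void transposeRef(const std::int8_t* in, std::size_t rows, std::size_t cols,
                         std::int8_t* out) {
  for (std::size_t r = 0; r < rows; ++r)
    for (std::size_t c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}
```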
+ Index dstRowBegin = j*rows + i; + vst1q_s8(&dst[ 0*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[0]), vget_low_s32(x4.val[0])))); + vst1q_s8(&dst[ 1*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[0]), vget_low_s32(x5.val[0])))); + vst1q_s8(&dst[ 2*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[0]), vget_low_s32(x6.val[0])))); + vst1q_s8(&dst[ 3*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[0]), vget_low_s32(x7.val[0])))); + vst1q_s8(&dst[ 4*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[1]), vget_low_s32(x4.val[1])))); + vst1q_s8(&dst[ 5*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[1]), vget_low_s32(x5.val[1])))); + vst1q_s8(&dst[ 6*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[1]), vget_low_s32(x6.val[1])))); + vst1q_s8(&dst[ 7*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[1]), vget_low_s32(x7.val[1])))); + + vst1q_s8(&dst[ 8*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[0]), vget_high_s32(x4.val[0])))); + vst1q_s8(&dst[ 9*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[0]), vget_high_s32(x5.val[0])))); + vst1q_s8(&dst[10*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[0]), vget_high_s32(x6.val[0])))); + vst1q_s8(&dst[11*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[0]), vget_high_s32(x7.val[0])))); + vst1q_s8(&dst[12*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[1]), vget_high_s32(x4.val[1])))); + vst1q_s8(&dst[13*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[1]), vget_high_s32(x5.val[1])))); + vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); + vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); + + // clang-format on +} + +// Specialization for int8_t +inline void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { + constexpr Index tile_size = 16; + // TODO(jerin): Enable + // assert(rows % tile_size == 0 && cols & tile_size == 0); + for(Index i = 0; i < rows; i += tile_size) { + for(Index j = 0; j < cols; j += tile_size) { + _transpose_16x16(input, i, j, rows, cols, output); + } + } +} + +struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); + + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); + } + } + } + +private: + float unquant_multiplier_; + const float *input_bias_prepared_; +}; + +struct UnquantizeAndWrite { + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; + } + } + } + +private: + float unquant_multiplier_; +}; + +#endif + +/* + * The following nomenclature comes from intgemm. The current state of code is to keep the + * intgemm_interface.h diff minimal. There are possibly better abstractions. + */ +struct IntgemmViaRuy { + // Intgemm nomenclature expects Int8. Missing functions are ABORTs. + struct Int8 { + using Type = int8_t; + static void PrepareBQuantizedTransposed(const Type *input, + Type *output, + Index rows, + Index cols) { + std::memcpy(output, input, /*count=*/sizeof(Type) * (rows * cols)); + } + + static void PrepareBTransposed(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + quantize(input, output, quant_mult, rows, cols); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + + static void PrepareA(const float *input, + int8_t *output, + float quant_mult, + Index rows, + Index cols) { + quantize(input, output, quant_mult, rows, cols); + } + + static void SelectColumnsB(const Type *input, + Type *output, + Index width, + const Index *cols, + const Index *cols_end) { + // B_prepared is expected to be col-major, for our implementation via ruy. If + // col-major we can memcpy the respective column entries as they're + // sequential. There are width=rows entries. + Index num_cols = static_cast(std::distance(cols, cols_end)); + for(Index c = 0; c < num_cols; ++c) { + std::memcpy(&(output[c * width]), &(input[cols[c] * width]), width); + } + } + + // We don't have callback an no-op capability here yet. Multiply is kept similar to Mozilla + // specification and there are overloads with and without bias to avoid an if inside. This + // method corresponds to the one with bias. 
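Before the `Multiply` overloads, a scalar sketch of what the two NEON callbacks above compute: convert ruy's int32 accumulator back to float, optionally adding a per-column prepared bias (passing a null bias models `UnquantizeAndWrite`). Names are illustrative.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar sketch of UnquantizeAndAddBiasAndWrite / UnquantizeAndWrite: per element,
// output = float(input) * unquantMult (+ bias[column]); the bias is reused for
// every row, matching the "Bias cycles every column" comment above.
inline void unquantizeRef(const std::int32_t* in, float unquantMult, const float* bias,
                          std::size_t rowsA, std::size_t colsB, float* out) {
  for (std::size_t r = 0; r < rowsA; ++r)
    for (std::size_t c = 0; c < colsB; ++c) {
      float v = static_cast<float>(in[r * colsB + c]) * unquantMult;
      out[r * colsB + c] = bias ? v + bias[c] : v;
    }
}
```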
+ // output = A*B + bias + template + static void Multiply(const Type *input_A_prepared, + const Type *input_B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + // It is expected that somehow we have managed to call all prepare by the time + // we are here, with inputs (prepared) in int8_t. All that's left to do is use + // ruy for multiply and then start with the reverse ops to get to fp32. + + // Use ruy to multiply. + // The following is adapted from + // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152 + + ruy::Context context; + ruy::Matrix lhs; + ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); + lhs.set_data(input_A_prepared); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); + rhs.set_data(input_B_prepared); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); + + std::int32_t *dest_ptr = reinterpret_cast(output); + dst.set_data(dest_ptr); + + // When Dst is int32, mul_params is unused. + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + callback(dest_ptr, rows_A, cols_B, output); + } + }; + + // Int16 support is currently missing. + struct Int16 { + using Type = int16_t; + static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } + + static void PrepareA(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + ABORT("PrepareA Unsupported"); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { + ABORT("PrepareBQuantizedTransposed Unsupported"); + } + static void PrepareBTransposed(const float *, Type *, float, Index, Index) { + ABORT("PrepareBTransposed Unsupported"); + } + static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { + ABORT("SelectColumnsB Unsupported"); + } + + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + ABORT("Multiply (A*B) Unsupported"); + } + }; + + template + static T MaxAbsolute(const T *begin, const T *end) { + T result = 0; + for(auto p = begin; p < end; ++p) { + result = std::max(result, std::abs(*p)); + } + return result; + } +}; + +} // namespace integer +} // namespace cpu +} // namespace marian
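Putting the pieces together, below is a self-contained scalar model of the int8 path that `IntgemmViaRuy` wires up for ARM: quantize both operands with 127/maxAbs, multiply-accumulate in integers (what ruy does), then unquantize the int32 accumulator with roughly 1/(quantMultA * quantMultB). All names are illustrative; the code is a reference model, not marian's implementation.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar model of the quantized GEMM path: C ~= (A_q * B_q) / (qa * qb).
static void quantizedGemmRef(const std::vector<float>& A, const std::vector<float>& B,
                             std::vector<float>& C, int rowsA, int width, int colsB) {
  auto quantMult = [](const std::vector<float>& x) {
    float maxAbs = 0.f;
    for (float v : x) maxAbs = std::max(maxAbs, std::fabs(v));
    return 127.0f / maxAbs;                          // cf. 127.0f / MaxAbsolute(...)
  };
  const float qa = quantMult(A), qb = quantMult(B);

  auto q8 = [](float v, float q) {
    long r = std::lround(v * q);
    return static_cast<std::int8_t>(std::max<long>(-128, std::min<long>(127, r)));
  };

  C.assign(rowsA * colsB, 0.f);
  for (int i = 0; i < rowsA; ++i)
    for (int j = 0; j < colsB; ++j) {
      std::int32_t acc = 0;                          // ruy accumulates in int32
      for (int k = 0; k < width; ++k)
        acc += static_cast<std::int32_t>(q8(A[i * width + k], qa))
             * static_cast<std::int32_t>(q8(B[k * colsB + j], qb));
      C[i * colsB + j] = acc / (qa * qb);            // unquantize, cf. UnquantizeAndWrite
    }
}
```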