diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index cee2d9ba..10341ba5 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -48,7 +48,7 @@ jobs: cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -ftemplate-backtrace-limit=0" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" \ -DBLAST_WITH_BENCHMARK=OFF \ -DBLAST_WITH_TEST=ON diff --git a/README.md b/README.md index cee8e2ff..67226b39 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,7 @@ # BLAST -*BLAST* is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming. +*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like interface with modern C++ template metaprogramming. *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications. -The name stands for **BLAS** **T**emplates. - The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an *Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz*: ![dgemm_performance](doc/dgemm_performance.png) @@ -104,4 +102,4 @@ cd blast docker build . --tag blast_bench . docker run -v `pwd`/bench_result/docker:/root/blast/bench_result blast_bench ``` -The benchmark results will be put in `/bench_result/docker`. \ No newline at end of file +The benchmark results will be put in `/bench_result/docker`. diff --git a/bench/blast/CMakeLists.txt b/bench/blast/CMakeLists.txt index 603a26e9..6d20751a 100644 --- a/bench/blast/CMakeLists.txt +++ b/bench/blast/CMakeLists.txt @@ -35,6 +35,12 @@ add_executable(bench-blast math/panel/DynamicPotrf.cpp ) +target_compile_definitions(bench-blast + # Use Blaze without linking to a BLAS library. + # Blaze is used to prepare data in some of the benchmarks. + PRIVATE BLAZE_BLAS_MODE=0 +) + target_link_libraries(bench-blast blast bench-blast-common diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp new file mode 100644 index 00000000..9521cb39 --- /dev/null +++ b/include/blast/math/algorithm/Tile.hpp @@ -0,0 +1,124 @@ +// Copyright 2023 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include + + +namespace blast +{ + /** + * @brief Cover a matrix with tiles of different sizes in a performance-efficient way. + * + * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture. + * Positions and sizes of the tiles are such that the entire matrix is covered and tiles do not overlap. + * + * This function is helpful in implementing register-blocked matrix algorithms. + * + * For each tile one of the two specified functors @a f_full, @a f_partial is called: + * + * 1) @a f_full (ker, i, j); // if tile size equals ker.columns() by ker.rows() + * 2) @a f_partial (ker, i, j, km, kn); // if tile size is smaller than ker.columns() by ker.rows() + * + * where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile, + * and (km, kn) are dimensions of the tile. + * + * @tparam ET type of matrix elements + * @tparam SO matrix storage order + * @tparam F functor type + * + * @param m number of matrix rows + * @param n number of matrix columns + * @param f_full functor to call on full tiles + * @param f_partial functor to call on partial tiles + */ + template + BLAST_ALWAYS_INLINE void tile(std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + { + size_t constexpr TILE_SIZE = TileSize_v; + + size_t j = 0; + + // Main part + for (; j + TILE_SIZE <= n; j += TILE_SIZE) + { + size_t i = 0; + + // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: + // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. + for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + // Bottom side + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, ker.columns()); + } + } + + + // Right side + if (j < n) + { + size_t i = 0; + + // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: + // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. + for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + // Bottom-right corner + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, n - j); + } + } + } +} \ No newline at end of file diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp index 60456fcf..a2cb87aa 100644 --- a/include/blast/math/dense/Gemm.hpp +++ b/include/blast/math/dense/Gemm.hpp @@ -4,18 +4,21 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include + +#include #include namespace blast { + /** * @brief Performs the matrix-matrix operation * @@ -51,83 +54,25 @@ namespace blast MatrixPointer && StorageOrder_v == columnMajor && MatrixPointer && StorageOrder_v == columnMajor ) - BLAZE_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) + BLAST_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) { - using ET = std::remove_cv_t>; + using ET = std::remove_cv_t>; size_t constexpr TILE_SIZE = TileSize_v; BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); - size_t j = 0; - - // Main part - for (; j + TILE_SIZE <= N; j += TILE_SIZE) - { - size_t i = 0; - - // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: - // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. - for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) + tile)>(M, N, + [&] (auto& ker, size_t i, size_t j) { - RegisterMatrix ker; gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - // Bottom edge - if (i < M) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, ker.columns()); - } - } - - - // Right edge - if (j < N) - { - size_t i = 0; - - // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: - // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. - for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - // Bottom-right corner - if (i < M) + }, + [&] (auto& ker, size_t i, size_t j, size_t m, size_t n) { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, N - j); + gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), m, n); } - } + ); } diff --git a/include/blast/math/dense/Ger.hpp b/include/blast/math/dense/Ger.hpp index 9adecf9f..f7e02fd3 100644 --- a/include/blast/math/dense/Ger.hpp +++ b/include/blast/math/dense/Ger.hpp @@ -97,7 +97,7 @@ namespace blast ker.store(B(i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -108,7 +108,7 @@ namespace blast } - // Right edge + // Right side if (j < N) { size_t i = 0; diff --git a/include/blast/math/dense/Syrk.hpp b/include/blast/math/dense/Syrk.hpp index 9bac21c2..cafaf66a 100644 --- a/include/blast/math/dense/Syrk.hpp +++ b/include/blast/math/dense/Syrk.hpp @@ -77,7 +77,7 @@ namespace blast ker.store(ptr(D, i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -91,7 +91,7 @@ namespace blast } - // Right edge + // Right side if (j < M) { size_t i = j; diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp index 46c7e9f0..041159e0 100644 --- a/include/blast/math/dense/Trmm.hpp +++ b/include/blast/math/dense/Trmm.hpp @@ -122,7 +122,7 @@ namespace blast ker.store(ptr(C, i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -136,7 +136,7 @@ namespace blast } - // Right edge + // Right side if (j < N) { size_t i = 0; diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp new file mode 100644 index 00000000..8ec59251 --- /dev/null +++ b/include/blast/system/Inline.hpp @@ -0,0 +1,19 @@ +// Copyright 2023 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#define BLAST_ALWAYS_INLINE BLAZE_ALWAYS_INLINE