From d75b59c2eedc8bf5917acca7b903201f90eb704f Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 16 Mar 2023 20:25:56 +0300 Subject: [PATCH 1/7] Update README.md --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cee8e2ff..f3f0b703 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,7 @@ # BLAST -*BLAST* is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming. +*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming. *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications. -The name stands for **BLAS** **T**emplates. - The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an *Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz*: ![dgemm_performance](doc/dgemm_performance.png) @@ -104,4 +102,4 @@ cd blast docker build . --tag blast_bench . docker run -v `pwd`/bench_result/docker:/root/blast/bench_result blast_bench ``` -The benchmark results will be put in `/bench_result/docker`. \ No newline at end of file +The benchmark results will be put in `/bench_result/docker`. From 3116abf85410818e8a8b9bab7d7ca4434173012c Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 16 Mar 2023 20:24:17 +0100 Subject: [PATCH 2/7] Added tile() function --- include/blast/math/algorithm/Tile.hpp | 122 ++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 include/blast/math/algorithm/Tile.hpp diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp new file mode 100644 index 00000000..4b652089 --- /dev/null +++ b/include/blast/math/algorithm/Tile.hpp @@ -0,0 +1,122 @@ +// Copyright 2023 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + + +namespace blast +{ + /** + * @brief Cover a matrix with tiles of different sizes in a performance-efficient way. + * + * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture. + * Positions and sizes of the tiles are such that the entire matrix is coveren and tiles do not overlap. + * + * This function is helpful in implementing register-blocked matrix algorithms. + * + * For each tile a specified functor @a func is called in one of the following ways: + * + * 1) func(ker, i, j); + * 2) func(ker, i, j, km, kn); + * + * Where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile, + * and (km, kn) are dimensions of the tile. + * + * @tparam ET type of matrix elements + * @tparam SO matrix storage order + * @tparam F functor type + * + * @param m number of matrix rows + * @param n number of matrix columns + * @param func functor to call + */ + template + inline void tile(std::size_t m, std::size_t n, F&& func) + { + size_t constexpr TILE_SIZE = TileSize_v; + + size_t j = 0; + + // Main part + for (; j + TILE_SIZE <= n; j += TILE_SIZE) + { + size_t i = 0; + + // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: + // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. + for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j); + } + + for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j); + } + + for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j); + } + + // Bottom edge + if (i < m) + { + RegisterMatrix ker; + func(ker, i, j, m - i, ker.columns()); + } + } + + + // Right edge + if (j < n) + { + size_t i = 0; + + // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: + // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. + for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j, ker.rows(), n - j); + } + + for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j, ker.rows(), n - j); + } + + for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) + { + RegisterMatrix ker; + func(ker, i, j, ker.rows(), n - j); + } + + // Bottom-right corner + if (i < m) + { + RegisterMatrix ker; + func(ker, i, j, m - i, n - j); + } + } + } +} \ No newline at end of file From e33d7b40349536b66a33ee5cc6142d5cdb7f09fa Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 16 Mar 2023 20:44:57 +0100 Subject: [PATCH 3/7] gemm() using tile() --- include/blast/math/algorithm/Tile.hpp | 32 +++++----- include/blast/math/dense/Gemm.hpp | 87 +++++---------------------- include/blast/system/Inline.hpp | 19 ++++++ 3 files changed, 52 insertions(+), 86 deletions(-) create mode 100644 include/blast/system/Inline.hpp diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 4b652089..9339a56b 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -29,12 +30,12 @@ namespace blast * * This function is helpful in implementing register-blocked matrix algorithms. * - * For each tile a specified functor @a func is called in one of the following ways: + * For each tile one of the two specified functors @a f_full, @a f_partial is called: * - * 1) func(ker, i, j); - * 2) func(ker, i, j, km, kn); + * 1) @a f_full (ker, i, j); // if tile size equals ker.columns() by ker.rows() + * 2) @a f_partial (ker, i, j, km, kn); // if tile size is smaller than ker.columns() by ker.rows() * - * Where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile, + * where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile, * and (km, kn) are dimensions of the tile. * * @tparam ET type of matrix elements @@ -43,10 +44,11 @@ namespace blast * * @param m number of matrix rows * @param n number of matrix columns - * @param func functor to call + * @param f_full functor to call on full tiles + * @param f_partial functor to call on partial tiles */ - template - inline void tile(std::size_t m, std::size_t n, F&& func) + template + BLAST_ALWAYS_INLINE void tile(std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) { size_t constexpr TILE_SIZE = TileSize_v; @@ -62,26 +64,26 @@ namespace blast for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j); + f_full(ker, i, j); } for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j); + f_full(ker, i, j); } for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j); + f_full(ker, i, j); } // Bottom edge if (i < m) { RegisterMatrix ker; - func(ker, i, j, m - i, ker.columns()); + f_partial(ker, i, j, m - i, ker.columns()); } } @@ -96,26 +98,26 @@ namespace blast for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j, ker.rows(), n - j); + f_partial(ker, i, j, ker.rows(), n - j); } for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j, ker.rows(), n - j); + f_partial(ker, i, j, ker.rows(), n - j); } for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE) { RegisterMatrix ker; - func(ker, i, j, ker.rows(), n - j); + f_partial(ker, i, j, ker.rows(), n - j); } // Bottom-right corner if (i < m) { RegisterMatrix ker; - func(ker, i, j, m - i, n - j); + f_partial(ker, i, j, m - i, n - j); } } } diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp index 60456fcf..a2cb87aa 100644 --- a/include/blast/math/dense/Gemm.hpp +++ b/include/blast/math/dense/Gemm.hpp @@ -4,18 +4,21 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include + +#include #include namespace blast { + /** * @brief Performs the matrix-matrix operation * @@ -51,83 +54,25 @@ namespace blast MatrixPointer && StorageOrder_v == columnMajor && MatrixPointer && StorageOrder_v == columnMajor ) - BLAZE_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) + BLAST_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D) { - using ET = std::remove_cv_t>; + using ET = std::remove_cv_t>; size_t constexpr TILE_SIZE = TileSize_v; BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t>, ET); - size_t j = 0; - - // Main part - for (; j + TILE_SIZE <= N; j += TILE_SIZE) - { - size_t i = 0; - - // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: - // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. - for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) + tile)>(M, N, + [&] (auto& ker, size_t i, size_t j) { - RegisterMatrix ker; gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j)); - } - - // Bottom edge - if (i < M) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, ker.columns()); - } - } - - - // Right edge - if (j < N) - { - size_t i = 0; - - // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE: - // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel. - for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE) - { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j); - } - - // Bottom-right corner - if (i < M) + }, + [&] (auto& ker, size_t i, size_t j, size_t m, size_t n) { - RegisterMatrix ker; - gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, N - j); + gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), m, n); } - } + ); } diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp new file mode 100644 index 00000000..8ec59251 --- /dev/null +++ b/include/blast/system/Inline.hpp @@ -0,0 +1,19 @@ +// Copyright 2023 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#define BLAST_ALWAYS_INLINE BLAZE_ALWAYS_INLINE From c788324334ce50078c657f22ff41ef0a43dd2dd4 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Feb 2024 16:47:29 +0100 Subject: [PATCH 4/7] Add BLAZE_BLAS_MODE=0 in bench-blast --- bench/blast/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bench/blast/CMakeLists.txt b/bench/blast/CMakeLists.txt index 603a26e9..6d20751a 100644 --- a/bench/blast/CMakeLists.txt +++ b/bench/blast/CMakeLists.txt @@ -35,6 +35,12 @@ add_executable(bench-blast math/panel/DynamicPotrf.cpp ) +target_compile_definitions(bench-blast + # Use Blaze without linking to a BLAS library. + # Blaze is used to prepare data in some of the benchmarks. + PRIVATE BLAZE_BLAS_MODE=0 +) + target_link_libraries(bench-blast blast bench-blast-common From 6fd9c950c5fe3ed49e5ae4a9e0f524cb69e20aee Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Thu, 1 Feb 2024 17:24:15 +0100 Subject: [PATCH 5/7] edge -> side --- include/blast/math/algorithm/Tile.hpp | 4 ++-- include/blast/math/dense/Ger.hpp | 4 ++-- include/blast/math/dense/Syrk.hpp | 4 ++-- include/blast/math/dense/Trmm.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 9339a56b..c5ff2163 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -79,7 +79,7 @@ namespace blast f_full(ker, i, j); } - // Bottom edge + // Bottom side if (i < m) { RegisterMatrix ker; @@ -88,7 +88,7 @@ namespace blast } - // Right edge + // Right side if (j < n) { size_t i = 0; diff --git a/include/blast/math/dense/Ger.hpp b/include/blast/math/dense/Ger.hpp index 9adecf9f..f7e02fd3 100644 --- a/include/blast/math/dense/Ger.hpp +++ b/include/blast/math/dense/Ger.hpp @@ -97,7 +97,7 @@ namespace blast ker.store(B(i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -108,7 +108,7 @@ namespace blast } - // Right edge + // Right side if (j < N) { size_t i = 0; diff --git a/include/blast/math/dense/Syrk.hpp b/include/blast/math/dense/Syrk.hpp index 9bac21c2..cafaf66a 100644 --- a/include/blast/math/dense/Syrk.hpp +++ b/include/blast/math/dense/Syrk.hpp @@ -77,7 +77,7 @@ namespace blast ker.store(ptr(D, i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -91,7 +91,7 @@ namespace blast } - // Right edge + // Right side if (j < M) { size_t i = j; diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp index 46c7e9f0..041159e0 100644 --- a/include/blast/math/dense/Trmm.hpp +++ b/include/blast/math/dense/Trmm.hpp @@ -122,7 +122,7 @@ namespace blast ker.store(ptr(C, i, j)); } - // Bottom edge + // Bottom side if (i < M) { RegisterMatrix ker; @@ -136,7 +136,7 @@ namespace blast } - // Right edge + // Right side if (j < N) { size_t i = 0; From 0b79d8b922abd244d03aff82928fe4f6e80516d5 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Tue, 13 Feb 2024 20:20:01 +0100 Subject: [PATCH 6/7] Fixed spelling errors --- README.md | 2 +- include/blast/math/algorithm/Tile.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f3f0b703..67226b39 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # BLAST -*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming. +*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like interface with modern C++ template metaprogramming. *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications. The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index c5ff2163..9521cb39 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -26,7 +26,7 @@ namespace blast * @brief Cover a matrix with tiles of different sizes in a performance-efficient way. * * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture. - * Positions and sizes of the tiles are such that the entire matrix is coveren and tiles do not overlap. + * Positions and sizes of the tiles are such that the entire matrix is covered and tiles do not overlap. * * This function is helpful in implementing register-blocked matrix algorithms. * From 391e92a9cad3836fe7e9ec18bb3162f4190a7191 Mon Sep 17 00:00:00 2001 From: Mikhail Katliar Date: Tue, 13 Feb 2024 20:54:29 +0100 Subject: [PATCH 7/7] Added -ftemplate-backtrace-limit=0 in cmake.yml workflow for better diagnostics --- .github/workflows/cmake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index cee2d9ba..10341ba5 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -48,7 +48,7 @@ jobs: cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \ + -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -ftemplate-backtrace-limit=0" \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" \ -DBLAST_WITH_BENCHMARK=OFF \ -DBLAST_WITH_TEST=ON