From d75b59c2eedc8bf5917acca7b903201f90eb704f Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 16 Mar 2023 20:25:56 +0300
Subject: [PATCH 1/7] Update README.md

---
 README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index cee8e2ff..f3f0b703 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,7 @@
 # BLAST
-*BLAST* is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming.
+*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming.
 *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications.
 
-The name stands for **BLAS** **T**emplates.
-
 The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an
 *Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz*:
 ![dgemm_performance](doc/dgemm_performance.png)
@@ -104,4 +102,4 @@ cd blast
 docker build . --tag blast_bench .
 docker run -v `pwd`/bench_result/docker:/root/blast/bench_result blast_bench
 ```
-The benchmark results will be put in `/bench_result/docker`.
\ No newline at end of file
+The benchmark results will be put in `/bench_result/docker`.

From 3116abf85410818e8a8b9bab7d7ca4434173012c Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 16 Mar 2023 20:24:17 +0100
Subject: [PATCH 2/7] Added tile() function

---
 include/blast/math/algorithm/Tile.hpp | 122 ++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 include/blast/math/algorithm/Tile.hpp

diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp
new file mode 100644
index 00000000..4b652089
--- /dev/null
+++ b/include/blast/math/algorithm/Tile.hpp
@@ -0,0 +1,122 @@
+// Copyright 2023 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <blast/system/Tile.hpp>
+#include <blast/math/StorageOrder.hpp>
+#include <blast/math/simd/RegisterMatrix.hpp>
+
+#include <cstdlib>
+
+
+namespace blast
+{
+    /**
+     * @brief Cover a matrix with tiles of different sizes in a performance-efficient way.
+     *
+     * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture.
+     * Positions and sizes of the tiles are such that the entire matrix is coveren and tiles do not overlap.
+     *
+     * This function is helpful in implementing register-blocked matrix algorithms.
+     *
+     * For each tile a specified functor @a func is called in one of the following ways:
+     *
+     * 1) func(ker, i, j);
+     * 2) func(ker, i, j, km, kn);
+     *
+     * Where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile,
+     * and (km, kn) are dimensions of the tile.
+     *
+     * @tparam ET type of matrix elements
+     * @tparam SO matrix storage order
+     * @tparam F functor type
+     *
+     * @param m number of matrix rows
+     * @param n number of matrix columns
+     * @param func functor to call
+     */
+    template <typename ET, StorageOrder SO, typename F>
+    inline void tile(std::size_t m, std::size_t n, F&& func)
+    {
+        size_t constexpr TILE_SIZE = TileSize_v<ET>;
+
+        size_t j = 0;
+
+        // Main part
+        for (; j + TILE_SIZE <= n; j += TILE_SIZE)
+        {
+            size_t i = 0;
+
+            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
+            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
+            for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j);
+            }
+
+            for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j);
+            }
+
+            for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j);
+            }
+
+            // Bottom edge
+            if (i < m)
+            {
+                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j, m - i, ker.columns());
+            }
+        }
+
+
+        // Right edge
+        if (j < n)
+        {
+            size_t i = 0;
+
+            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
+            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
+            for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j, ker.rows(), n - j);
+            }
+
+            for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j, ker.rows(), n - j);
+            }
+
+            for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j, ker.rows(), n - j);
+            }
+
+            // Bottom-right corner
+            if (i < m)
+            {
+                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                func(ker, i, j, m - i, n - j);
+            }
+        }
+    }
+}
\ No newline at end of file

From e33d7b40349536b66a33ee5cc6142d5cdb7f09fa Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 16 Mar 2023 20:44:57 +0100
Subject: [PATCH 3/7] gemm() using tile()

---
 include/blast/math/algorithm/Tile.hpp | 32 +++++-----
 include/blast/math/dense/Gemm.hpp     | 87 +++++----------------------
 include/blast/system/Inline.hpp       | 19 ++++++
 3 files changed, 52 insertions(+), 86 deletions(-)
 create mode 100644 include/blast/system/Inline.hpp

diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp
index 4b652089..9339a56b 100644
--- a/include/blast/math/algorithm/Tile.hpp
+++ b/include/blast/math/algorithm/Tile.hpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <blast/system/Tile.hpp>
+#include <blast/system/Inline.hpp>
 #include <blast/math/StorageOrder.hpp>
 #include <blast/math/simd/RegisterMatrix.hpp>
 
@@ -29,12 +30,12 @@ namespace blast
      *
      * This function is helpful in implementing register-blocked matrix algorithms.
      *
-     * For each tile a specified functor @a func is called in one of the following ways:
+     * For each tile one of the two specified functors @a f_full, @a f_partial is called:
      *
-     * 1) func(ker, i, j);
-     * 2) func(ker, i, j, km, kn);
+     * 1) @a f_full (ker, i, j);  // if tile size equals ker.columns() by ker.rows()
+     * 2) @a f_partial (ker, i, j, km, kn);  // if tile size is smaller than ker.columns() by ker.rows()
      *
-     * Where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile,
+     * where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile,
      * and (km, kn) are dimensions of the tile.
      *
      * @tparam ET type of matrix elements
@@ -43,10 +44,11 @@ namespace blast
      *
      * @param m number of matrix rows
      * @param n number of matrix columns
-     * @param func functor to call
+     * @param f_full functor to call on full tiles
+     * @param f_partial functor to call on partial tiles
      */
-    template <typename ET, StorageOrder SO, typename F>
-    inline void tile(std::size_t m, std::size_t n, F&& func)
+    template <typename ET, StorageOrder SO, typename FF, typename FP>
+    BLAST_ALWAYS_INLINE void tile(std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial)
     {
         size_t constexpr TILE_SIZE = TileSize_v<ET>;
 
@@ -62,26 +64,26 @@ namespace blast
             for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j);
+                f_full(ker, i, j);
             }
 
             for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j);
+                f_full(ker, i, j);
             }
 
             for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j);
+                f_full(ker, i, j);
             }
 
             // Bottom edge
             if (i < m)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j, m - i, ker.columns());
+                f_partial(ker, i, j, m - i, ker.columns());
             }
         }
 
@@ -96,26 +98,26 @@ namespace blast
             for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j, ker.rows(), n - j);
+                f_partial(ker, i, j, ker.rows(), n - j);
             }
 
             for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j, ker.rows(), n - j);
+                f_partial(ker, i, j, ker.rows(), n - j);
             }
 
             for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
             {
                 RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j, ker.rows(), n - j);
+                f_partial(ker, i, j, ker.rows(), n - j);
             }
 
             // Bottom-right corner
             if (i < m)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                func(ker, i, j, m - i, n - j);
+                f_partial(ker, i, j, m - i, n - j);
             }
         }
     }
diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp
index 60456fcf..a2cb87aa 100644
--- a/include/blast/math/dense/Gemm.hpp
+++ b/include/blast/math/dense/Gemm.hpp
@@ -4,18 +4,21 @@
 
 #pragma once
 
-#include <blaze/math/StorageOrder.h>
-#include <blaze/math/typetraits/StorageOrder.h>
-#include <blaze/math/views/Forward.h>
-#include <blast/math/dense/GemmBackend.hpp>
+#include <blast/system/Inline.hpp>
+#include <blast/math/typetraits/StorageOrder.hpp>
+#include <blast/math/typetraits/MatrixPointer.hpp>
+#include <blast/math/algorithm/Tile.hpp>
 #include <blast/math/dense/MatrixPointer.hpp>
 
-#include <algorithm>
+#include <blaze/util/constraints/SameType.h>
+
+#include <cstddef>
 #include <type_traits>
 
 
 namespace blast
 {
+
     /**
      * @brief Performs the matrix-matrix operation
      *
@@ -51,83 +54,25 @@ namespace blast
         MatrixPointer<MPC> && StorageOrder_v<MPC> == columnMajor &&
         MatrixPointer<MPD> && StorageOrder_v<MPD> == columnMajor
     )
-    BLAZE_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D)
+    BLAST_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D)
     {
-        using ET = std::remove_cv_t<ElementType_t<MPA>>;
+        using ET = std::remove_cv_t<ElementType_t<MPD>>;
         size_t constexpr TILE_SIZE = TileSize_v<ET>;
 
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPB>>, ET);
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPC>>, ET);
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPD>>, ET);
 
-        size_t j = 0;
-
-        // Main part
-        for (; j + TILE_SIZE <= N; j += TILE_SIZE)
-        {
-            size_t i = 0;
-
-            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
-            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
-            for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE)
+        tile<ET, StorageOrder(StorageOrder_v<MPD>)>(M, N,
+            [&] (auto& ker, size_t i, size_t j)
             {
-                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
                 gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            // Bottom edge
-            if (i < M)
-            {
-                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, ker.columns());
-            }
-        }
-
-
-        // Right edge
-        if (j < N)
-        {
-            size_t i = 0;
-
-            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
-            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
-            for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            // Bottom-right corner
-            if (i < M)
+            },
+            [&] (auto& ker, size_t i, size_t j, size_t m, size_t n)
             {
-                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, N - j);
+                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), m, n);
             }
-        }
+        );
     }
 
 
diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp
new file mode 100644
index 00000000..8ec59251
--- /dev/null
+++ b/include/blast/system/Inline.hpp
@@ -0,0 +1,19 @@
+// Copyright 2023 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <blaze/system/Inline.h>
+
+#define BLAST_ALWAYS_INLINE BLAZE_ALWAYS_INLINE

From c788324334ce50078c657f22ff41ef0a43dd2dd4 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Feb 2024 16:47:29 +0100
Subject: [PATCH 4/7] Add BLAZE_BLAS_MODE=0 in bench-blast

---
 bench/blast/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bench/blast/CMakeLists.txt b/bench/blast/CMakeLists.txt
index 603a26e9..6d20751a 100644
--- a/bench/blast/CMakeLists.txt
+++ b/bench/blast/CMakeLists.txt
@@ -35,6 +35,12 @@ add_executable(bench-blast
     math/panel/DynamicPotrf.cpp
 )
 
+target_compile_definitions(bench-blast
+    # Use Blaze without linking to a BLAS library.
+    # Blaze is used to prepare data in some of the benchmarks.
+    PRIVATE BLAZE_BLAS_MODE=0
+)
+
 target_link_libraries(bench-blast
     blast
     bench-blast-common

From 6fd9c950c5fe3ed49e5ae4a9e0f524cb69e20aee Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Thu, 1 Feb 2024 17:24:15 +0100
Subject: [PATCH 5/7] edge -> side

---
 include/blast/math/algorithm/Tile.hpp | 4 ++--
 include/blast/math/dense/Ger.hpp      | 4 ++--
 include/blast/math/dense/Syrk.hpp     | 4 ++--
 include/blast/math/dense/Trmm.hpp     | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp
index 9339a56b..c5ff2163 100644
--- a/include/blast/math/algorithm/Tile.hpp
+++ b/include/blast/math/algorithm/Tile.hpp
@@ -79,7 +79,7 @@ namespace blast
                 f_full(ker, i, j);
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < m)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -88,7 +88,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < n)
         {
             size_t i = 0;
diff --git a/include/blast/math/dense/Ger.hpp b/include/blast/math/dense/Ger.hpp
index 9adecf9f..f7e02fd3 100644
--- a/include/blast/math/dense/Ger.hpp
+++ b/include/blast/math/dense/Ger.hpp
@@ -97,7 +97,7 @@ namespace blast
                 ker.store(B(i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -108,7 +108,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < N)
         {
             size_t i = 0;
diff --git a/include/blast/math/dense/Syrk.hpp b/include/blast/math/dense/Syrk.hpp
index 9bac21c2..cafaf66a 100644
--- a/include/blast/math/dense/Syrk.hpp
+++ b/include/blast/math/dense/Syrk.hpp
@@ -77,7 +77,7 @@ namespace blast
                     ker.store(ptr<aligned>(D, i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -91,7 +91,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < M)
         {
             size_t i = j;
diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp
index 46c7e9f0..041159e0 100644
--- a/include/blast/math/dense/Trmm.hpp
+++ b/include/blast/math/dense/Trmm.hpp
@@ -122,7 +122,7 @@ namespace blast
                 ker.store(ptr<aligned>(C, i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -136,7 +136,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < N)
         {
             size_t i = 0;

From 0b79d8b922abd244d03aff82928fe4f6e80516d5 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Tue, 13 Feb 2024 20:20:01 +0100
Subject: [PATCH 6/7] Fixed spelling errors

---
 README.md                             | 2 +-
 include/blast/math/algorithm/Tile.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f3f0b703..67226b39 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # BLAST
-*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming.
+*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like interface with modern C++ template metaprogramming.
 *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications.
 
 The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an
diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp
index c5ff2163..9521cb39 100644
--- a/include/blast/math/algorithm/Tile.hpp
+++ b/include/blast/math/algorithm/Tile.hpp
@@ -26,7 +26,7 @@ namespace blast
      * @brief Cover a matrix with tiles of different sizes in a performance-efficient way.
      *
      * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture.
-     * Positions and sizes of the tiles are such that the entire matrix is coveren and tiles do not overlap.
+     * Positions and sizes of the tiles are such that the entire matrix is covered and tiles do not overlap.
      *
      * This function is helpful in implementing register-blocked matrix algorithms.
      *

From 391e92a9cad3836fe7e9ec18bb3162f4190a7191 Mon Sep 17 00:00:00 2001
From: Mikhail Katliar <cepstr@gmail.com>
Date: Tue, 13 Feb 2024 20:54:29 +0100
Subject: [PATCH 7/7] Added -ftemplate-backtrace-limit=0 in cmake.yml workflow
 for better diagnostics

---
 .github/workflows/cmake.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index cee2d9ba..10341ba5 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -48,7 +48,7 @@ jobs:
         cmake -B ${{github.workspace}}/build \
         -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \
         -DCMAKE_CXX_COMPILER=clang++ \
-        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \
+        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -ftemplate-backtrace-limit=0" \
         -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" \
         -DBLAST_WITH_BENCHMARK=OFF \
         -DBLAST_WITH_TEST=ON