diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index cee2d9ba..10341ba5 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -48,7 +48,7 @@ jobs:
         cmake -B ${{github.workspace}}/build \
         -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \
         -DCMAKE_CXX_COMPILER=clang++ \
-        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno" \
+        -DCMAKE_CXX_FLAGS="-march=native -mfma -mavx -mavx2 -msse4 -fno-math-errno -ftemplate-backtrace-limit=0" \
         -DCMAKE_CXX_FLAGS_RELEASE="-O3 -g -DNDEBUG -ffast-math" \
         -DBLAST_WITH_BENCHMARK=OFF \
         -DBLAST_WITH_TEST=ON
diff --git a/README.md b/README.md
index cee8e2ff..67226b39 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,7 @@
 # BLAST
-*BLAST* is a high-performance linear algebra library that combines a BLAS-like nterface with modern C++ template metaprogramming.
+*BLAST* (**BLAS** **T**emplates) is a high-performance linear algebra library that combines a BLAS-like interface with modern C++ template metaprogramming.
 *BLAST* implementation is single-threaded and intended for matrices of small and medium size (a few hundred rows/columns), which is common for embedded control applications.
 
-The name stands for **BLAS** **T**emplates.
-
 The figures below shows the performance of BLAS *dgemm* routine for different LA implementations on an
 *Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz*:
 ![dgemm_performance](doc/dgemm_performance.png)
@@ -104,4 +102,4 @@ cd blast
 docker build . --tag blast_bench .
 docker run -v `pwd`/bench_result/docker:/root/blast/bench_result blast_bench
 ```
-The benchmark results will be put in `/bench_result/docker`.
\ No newline at end of file
+The benchmark results will be put in `/bench_result/docker`.
diff --git a/bench/blast/CMakeLists.txt b/bench/blast/CMakeLists.txt
index 603a26e9..6d20751a 100644
--- a/bench/blast/CMakeLists.txt
+++ b/bench/blast/CMakeLists.txt
@@ -35,6 +35,12 @@ add_executable(bench-blast
     math/panel/DynamicPotrf.cpp
 )
 
+target_compile_definitions(bench-blast
+    # Use Blaze without linking to a BLAS library.
+    # Blaze is used to prepare data in some of the benchmarks.
+    PRIVATE BLAZE_BLAS_MODE=0
+)
+
 target_link_libraries(bench-blast
     blast
     bench-blast-common
diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp
new file mode 100644
index 00000000..9521cb39
--- /dev/null
+++ b/include/blast/math/algorithm/Tile.hpp
@@ -0,0 +1,124 @@
+// Copyright 2023 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <blast/system/Tile.hpp>
+#include <blast/system/Inline.hpp>
+#include <blast/math/StorageOrder.hpp>
+#include <blast/math/simd/RegisterMatrix.hpp>
+
+#include <cstdlib>
+
+
+namespace blast
+{
+    /**
+     * @brief Cover a matrix with tiles of different sizes in a performance-efficient way.
+     *
+     * The tile sizes and positions are chosen based on matrix element type, storage order, and current system architecture.
+     * Positions and sizes of the tiles are such that the entire matrix is covered and tiles do not overlap.
+     *
+     * This function is helpful in implementing register-blocked matrix algorithms.
+     *
+     * For each tile one of the two specified functors @a f_full, @a f_partial is called:
+     *
+     * 1) @a f_full (ker, i, j);  // if tile size equals ker.columns() by ker.rows()
+     * 2) @a f_partial (ker, i, j, km, kn);  // if tile size is smaller than ker.columns() by ker.rows()
+     *
+     * where ker is a RegisterMatrix object, (i, j) are indices of top left corner of the tile,
+     * and (km, kn) are dimensions of the tile.
+     *
+     * @tparam ET type of matrix elements
+     * @tparam SO matrix storage order
+     * @tparam F functor type
+     *
+     * @param m number of matrix rows
+     * @param n number of matrix columns
+     * @param f_full functor to call on full tiles
+     * @param f_partial functor to call on partial tiles
+     */
+    template <typename ET, StorageOrder SO, typename FF, typename FP>
+    BLAST_ALWAYS_INLINE void tile(std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial)
+    {
+        size_t constexpr TILE_SIZE = TileSize_v<ET>;
+
+        size_t j = 0;
+
+        // Main part
+        for (; j + TILE_SIZE <= n; j += TILE_SIZE)
+        {
+            size_t i = 0;
+
+            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
+            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
+            for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_full(ker, i, j);
+            }
+
+            for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_full(ker, i, j);
+            }
+
+            for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_full(ker, i, j);
+            }
+
+            // Bottom side
+            if (i < m)
+            {
+                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_partial(ker, i, j, m - i, ker.columns());
+            }
+        }
+
+
+        // Right side
+        if (j < n)
+        {
+            size_t i = 0;
+
+            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
+            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
+            for (; i + 3 * TILE_SIZE <= m && i + 4 * TILE_SIZE != m; i += 3 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_partial(ker, i, j, ker.rows(), n - j);
+            }
+
+            for (; i + 2 * TILE_SIZE <= m; i += 2 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_partial(ker, i, j, ker.rows(), n - j);
+            }
+
+            for (; i + 1 * TILE_SIZE <= m; i += 1 * TILE_SIZE)
+            {
+                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_partial(ker, i, j, ker.rows(), n - j);
+            }
+
+            // Bottom-right corner
+            if (i < m)
+            {
+                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
+                f_partial(ker, i, j, m - i, n - j);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/include/blast/math/dense/Gemm.hpp b/include/blast/math/dense/Gemm.hpp
index 60456fcf..a2cb87aa 100644
--- a/include/blast/math/dense/Gemm.hpp
+++ b/include/blast/math/dense/Gemm.hpp
@@ -4,18 +4,21 @@
 
 #pragma once
 
-#include <blaze/math/StorageOrder.h>
-#include <blaze/math/typetraits/StorageOrder.h>
-#include <blaze/math/views/Forward.h>
-#include <blast/math/dense/GemmBackend.hpp>
+#include <blast/system/Inline.hpp>
+#include <blast/math/typetraits/StorageOrder.hpp>
+#include <blast/math/typetraits/MatrixPointer.hpp>
+#include <blast/math/algorithm/Tile.hpp>
 #include <blast/math/dense/MatrixPointer.hpp>
 
-#include <algorithm>
+#include <blaze/util/constraints/SameType.h>
+
+#include <cstddef>
 #include <type_traits>
 
 
 namespace blast
 {
+
     /**
      * @brief Performs the matrix-matrix operation
      *
@@ -51,83 +54,25 @@ namespace blast
         MatrixPointer<MPC> && StorageOrder_v<MPC> == columnMajor &&
         MatrixPointer<MPD> && StorageOrder_v<MPD> == columnMajor
     )
-    BLAZE_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D)
+    BLAST_ALWAYS_INLINE void gemm(size_t M, size_t N, size_t K, ST1 alpha, MPA A, MPB B, ST2 beta, MPC C, MPD D)
     {
-        using ET = std::remove_cv_t<ElementType_t<MPA>>;
+        using ET = std::remove_cv_t<ElementType_t<MPD>>;
         size_t constexpr TILE_SIZE = TileSize_v<ET>;
 
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPB>>, ET);
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPC>>, ET);
         BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(std::remove_cv_t<ElementType_t<MPD>>, ET);
 
-        size_t j = 0;
-
-        // Main part
-        for (; j + TILE_SIZE <= N; j += TILE_SIZE)
-        {
-            size_t i = 0;
-
-            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
-            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
-            for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE)
+        tile<ET, StorageOrder(StorageOrder_v<MPD>)>(M, N,
+            [&] (auto& ker, size_t i, size_t j)
             {
-                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
                 gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j));
-            }
-
-            // Bottom edge
-            if (i < M)
-            {
-                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, ker.columns());
-            }
-        }
-
-
-        // Right edge
-        if (j < N)
-        {
-            size_t i = 0;
-
-            // i + 4 * TILE_SIZE != M is to improve performance in case when the remaining number of rows is 4 * TILE_SIZE:
-            // it is more efficient to apply 2 * TILE_SIZE kernel 2 times than 3 * TILE_SIZE + 1 * TILE_SIZE kernel.
-            for (; i + 3 * TILE_SIZE <= M && i + 4 * TILE_SIZE != M; i += 3 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 3 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            for (; i + 2 * TILE_SIZE <= M; i += 2 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 2 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            for (; i + 1 * TILE_SIZE <= M; i += 1 * TILE_SIZE)
-            {
-                RegisterMatrix<ET, 1 * TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), ker.rows(), N - j);
-            }
-
-            // Bottom-right corner
-            if (i < M)
+            },
+            [&] (auto& ker, size_t i, size_t j, size_t m, size_t n)
             {
-                RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
-                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), M - i, N - j);
+                gemm(ker, K, alpha, A(i, 0), B(0, j), beta, C(i, j), D(i, j), m, n);
             }
-        }
+        );
     }
 
 
diff --git a/include/blast/math/dense/Ger.hpp b/include/blast/math/dense/Ger.hpp
index 9adecf9f..f7e02fd3 100644
--- a/include/blast/math/dense/Ger.hpp
+++ b/include/blast/math/dense/Ger.hpp
@@ -97,7 +97,7 @@ namespace blast
                 ker.store(B(i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -108,7 +108,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < N)
         {
             size_t i = 0;
diff --git a/include/blast/math/dense/Syrk.hpp b/include/blast/math/dense/Syrk.hpp
index 9bac21c2..cafaf66a 100644
--- a/include/blast/math/dense/Syrk.hpp
+++ b/include/blast/math/dense/Syrk.hpp
@@ -77,7 +77,7 @@ namespace blast
                     ker.store(ptr<aligned>(D, i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -91,7 +91,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < M)
         {
             size_t i = j;
diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp
index 46c7e9f0..041159e0 100644
--- a/include/blast/math/dense/Trmm.hpp
+++ b/include/blast/math/dense/Trmm.hpp
@@ -122,7 +122,7 @@ namespace blast
                 ker.store(ptr<aligned>(C, i, j));
             }
 
-            // Bottom edge
+            // Bottom side
             if (i < M)
             {
                 RegisterMatrix<ET, TILE_SIZE, TILE_SIZE, columnMajor> ker;
@@ -136,7 +136,7 @@ namespace blast
         }
 
 
-        // Right edge
+        // Right side
         if (j < N)
         {
             size_t i = 0;
diff --git a/include/blast/system/Inline.hpp b/include/blast/system/Inline.hpp
new file mode 100644
index 00000000..8ec59251
--- /dev/null
+++ b/include/blast/system/Inline.hpp
@@ -0,0 +1,19 @@
+// Copyright 2023 Mikhail Katliar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <blaze/system/Inline.h>
+
+#define BLAST_ALWAYS_INLINE BLAZE_ALWAYS_INLINE