Skip to content

Commit

Permalink
Make the code compile for ARM again
Browse files Browse the repository at this point in the history
  • Loading branch information
Mikhail Katliar authored and mkatliar committed Sep 19, 2024
1 parent 5e48ee7 commit 9defa20
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 10 deletions.
4 changes: 4 additions & 0 deletions include/blast/math/algorithm/Tile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
# include <blast/math/algorithm/arch/avx2/Tile.hpp>
#endif

#if XSIMD_WITH_NEON64
# include <blast/math/algorithm/arch/neon64/Tile.hpp>
#endif

#include <blast/math/StorageOrder.hpp>

#include <cstdlib>
Expand Down
3 changes: 2 additions & 1 deletion include/blast/math/algorithm/arch/avx2/Tile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <blast/system/Inline.hpp>
#include <blast/math/StorageOrder.hpp>
#include <blast/math/RegisterMatrix.hpp>
#include <blast/util/Types.hpp>

#include <blast/math/Simd.hpp>

Expand Down Expand Up @@ -46,7 +47,7 @@ namespace blast :: detail
BLAST_ALWAYS_INLINE void tile(xsimd::avx2 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial)
{
size_t constexpr SS = SimdSize_v<ET>;
size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be ppoperly determined
size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined

static_assert(SO == columnMajor, "tile() for row-major matrices not implemented");

Expand Down
141 changes: 141 additions & 0 deletions include/blast/math/algorithm/arch/neon64/Tile.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Copyright 2024 Mikhail Katliar. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include <blast/system/Tile.hpp>
#include <blast/system/Inline.hpp>
#include <blast/math/StorageOrder.hpp>
#include <blast/math/RegisterMatrix.hpp>
#include <blast/util/Types.hpp>

#include <blast/math/Simd.hpp>

#include <cstdlib>


namespace blast :: detail
{
/**
 * @brief Walk one strip of tiles that starts at row @a i and sweeps the columns [0, n).
 *
 * A single register matrix of size KM x KN is created once and handed to every callback call.
 *
 * - If the strip fits completely (i + KM <= m), each full-width tile is reported as
 *   f_full(ker, i, j), and a trailing narrower tile (if any) as f_partial(ker, i, j, KM, n - j).
 * - Otherwise every tile of the strip is reported through f_partial with m - i rows:
 *   KN columns for full-width tiles and n - j columns for the trailing one.
 *
 * @tparam ET passed to RegisterMatrix as its first template argument
 * @tparam KM number of rows of the register matrix
 * @tparam KN number of columns of the register matrix
 * @tparam SO passed to RegisterMatrix as its storage order argument
 * @tparam FF type of the full-tile callback
 * @tparam FP type of the partial-tile callback
 *
 * @param m total number of rows
 * @param n total number of columns
 * @param i row offset of the strip
 * @param f_full callback invoked as f_full(ker, i, j)
 * @param f_partial callback invoked as f_partial(ker, i, j, rows, columns)
 */
template <typename ET, size_t KM, size_t KN, StorageOrder SO, typename FF, typename FP>
BLAST_ALWAYS_INLINE void tile_backend(xsimd::neon64, size_t m, size_t n, size_t i, FF&& f_full, FP&& f_partial)
{
    RegisterMatrix<ET, KM, KN, SO> ker;
    size_t col = 0;

    if (m < i + KM)
    {
        // Fewer than KM rows are left: every tile in this strip is a partial one.
        while (col + KN <= n)
        {
            f_partial(ker, i, col, m - i, KN);
            col += KN;
        }

        // Trailing tile which is clipped in both directions.
        if (col < n)
            f_partial(ker, i, col, m - i, n - col);

        return;
    }

    // The strip has all KM rows: full-width tiles go to the full-tile callback.
    while (col + KN <= n)
    {
        f_full(ker, i, col);
        col += KN;
    }

    // Trailing tile with all KM rows but fewer than KN columns.
    if (col < n)
        f_partial(ker, i, col, KM, n - col);
}


/**
 * @brief Cover the index range [0, m) x [0, n) with register-matrix tiles (NEON64 overload)
 * and invoke a callback for each tile.
 *
 * Tiles are SS, 2 * SS or 3 * SS rows high (SS = SimdSize_v<ET>) and TILE_STEP columns wide.
 * A tile that lies completely inside the range is reported as f_full(ker, i, j);
 * a tile clipped by the bottom and/or right border is reported as
 * f_partial(ker, i, j, rows, columns), where rows and columns give the used part of the tile.
 *
 * With column-major traversal the columns are advanced in the outer loop and the rows in the inner loops.
 * With any other traversal order the rows are advanced in the outer loops and each row strip
 * is delegated to tile_backend().
 *
 * Only SO == columnMajor is supported; other values are rejected at compile time.
 *
 * @tparam ET type used to determine the SIMD size and passed to RegisterMatrix
 * @tparam SO storage order passed to RegisterMatrix
 * @tparam FF type of the full-tile callback
 * @tparam FP type of the partial-tile callback
 *
 * @param arch architecture tag, forwarded to tile_backend()
 * @param traversal_order order in which the tiles are visited
 * @param m total number of rows
 * @param n total number of columns
 * @param f_full callback invoked as f_full(ker, i, j)
 * @param f_partial callback invoked as f_partial(ker, i, j, rows, columns)
 */
template <typename ET, StorageOrder SO, typename FF, typename FP>
BLAST_ALWAYS_INLINE void tile(xsimd::neon64 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial)
{
    size_t constexpr SS = SimdSize_v<ET>;
    size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined

    static_assert(SO == columnMajor, "tile() for row-major matrices not implemented");

    if (!(traversal_order == columnMajor))
    {
        // Rows in the outer loops; tile_backend() sweeps the columns of each strip
        // and decides itself whether a tile is full or partial.
        size_t row = 0;

        // Use the 3 * SS kernel while more than 2 * SS rows remain.
        // The case of exactly 4 * SS remaining rows is excluded to improve performance:
        // it is more efficient to apply the 2 * SS kernel 2 times than the 3 * SS + 1 * SS kernels.
        while (row + 2 * SS < m && row + 4 * SS != m)
        {
            tile_backend<ET, 3 * SS, TILE_STEP, SO>(arch, m, n, row, f_full, f_partial);
            row += 3 * SS;
        }

        // Use the 2 * SS kernel while more than 1 * SS rows remain.
        while (row + 1 * SS < m)
        {
            tile_backend<ET, 2 * SS, TILE_STEP, SO>(arch, m, n, row, f_full, f_partial);
            row += 2 * SS;
        }

        // Use the 1 * SS kernel for whatever remains.
        while (row + 0 * SS < m)
        {
            tile_backend<ET, 1 * SS, TILE_STEP, SO>(arch, m, n, row, f_full, f_partial);
            row += 1 * SS;
        }

        return;
    }

    size_t col = 0;

    // Main part: column blocks which are exactly TILE_STEP wide.
    while (col + TILE_STEP <= n)
    {
        size_t row = 0;

        // The case of exactly 4 * SS remaining rows is excluded to improve performance:
        // it is more efficient to apply the 2 * SS kernel 2 times than the 3 * SS + 1 * SS kernels.
        while (row + 3 * SS <= m && row + 4 * SS != m)
        {
            RegisterMatrix<ET, 3 * SS, TILE_STEP, SO> ker;
            f_full(ker, row, col);
            row += 3 * SS;
        }

        while (row + 2 * SS <= m)
        {
            RegisterMatrix<ET, 2 * SS, TILE_STEP, SO> ker;
            f_full(ker, row, col);
            row += 2 * SS;
        }

        while (row + 1 * SS <= m)
        {
            RegisterMatrix<ET, 1 * SS, TILE_STEP, SO> ker;
            f_full(ker, row, col);
            row += 1 * SS;
        }

        // Bottom side: fewer than SS rows remain.
        if (row < m)
        {
            RegisterMatrix<ET, SS, TILE_STEP, SO> ker;
            f_partial(ker, row, col, m - row, ker.columns());
        }

        col += TILE_STEP;
    }

    // Right side: fewer than TILE_STEP columns remain, so every tile here is partial.
    if (col < n)
    {
        size_t row = 0;

        // The case of exactly 4 * SS remaining rows is excluded to improve performance:
        // it is more efficient to apply the 2 * SS kernel 2 times than the 3 * SS + 1 * SS kernels.
        while (row + 3 * SS <= m && row + 4 * SS != m)
        {
            RegisterMatrix<ET, 3 * SS, TILE_STEP, SO> ker;
            f_partial(ker, row, col, ker.rows(), n - col);
            row += 3 * SS;
        }

        while (row + 2 * SS <= m)
        {
            RegisterMatrix<ET, 2 * SS, TILE_STEP, SO> ker;
            f_partial(ker, row, col, ker.rows(), n - col);
            row += 2 * SS;
        }

        while (row + 1 * SS <= m)
        {
            RegisterMatrix<ET, 1 * SS, TILE_STEP, SO> ker;
            f_partial(ker, row, col, ker.rows(), n - col);
            row += 1 * SS;
        }

        // Bottom-right corner: clipped in both directions.
        if (row < m)
        {
            RegisterMatrix<ET, SS, TILE_STEP, SO> ker;
            f_partial(ker, row, col, m - row, n - col);
        }
    }
}
}
1 change: 1 addition & 0 deletions include/blast/math/dense/Trmm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#pragma once

#include <blast/math/dense/TrmmBackend.hpp>
#include <blast/system/Tile.hpp>

#include <blaze/util/Exception.h>
#include <blaze/util/constraints/SameType.h>
Expand Down
1 change: 0 additions & 1 deletion include/blast/math/panel/PanelSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ namespace blast
*
* TODO: Is it always equal to SIMD size? Deprecate?
*
* @tparam T data type
* @tparam Arch architecture
*/
template <typename T, typename Arch = xsimd::default_arch>
Expand Down
12 changes: 4 additions & 8 deletions include/blast/system/Tile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,14 @@

#pragma once

//*************************************************************************************************
// Includes
//*************************************************************************************************

#include <blaze/math/simd/SIMDTrait.h>
#include <blast/util/Types.hpp>


namespace blast
{
using namespace blaze;


/**
* @brief TODO: deprecate?
*/
template <typename T>
struct TileSize;

Expand Down

0 comments on commit 9defa20

Please sign in to comment.