Skip to content

Commit

Permalink
Optimize Renderer::fillVerticesAndIndices() (axmolengine#2065)
Browse files Browse the repository at this point in the history
* Optimize `Renderer::fillVerticesAndIndices()`

* Fix clobbered registers not being marked in inline assembly
  • Loading branch information
smilediver authored Aug 2, 2024
1 parent 377f340 commit 807a91b
Show file tree
Hide file tree
Showing 11 changed files with 559 additions and 116 deletions.
59 changes: 46 additions & 13 deletions core/math/MathUtil.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
Copyright 2013 BlackBerry Inc.
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -20,6 +21,7 @@ This file was modified to fit the cocos2d-x project
*/

#include "math/MathUtil.h"
#include "math/Mat4.h"
#include "base/Macros.h"

#if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
Expand All @@ -35,28 +37,29 @@ This file was modified to fit the cocos2d-x project

#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
# if defined(__arm64__)
# define USE_NEON64
# define INCLUDE_NEON64
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# elif defined(__ARM_NEON__)
# define USE_NEON32
# define INCLUDE_NEON32
# else
# define USE_NEON32 1
# define INCLUDE_NEON32 1
# endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
# if defined(__arm64__) || defined(__aarch64__)
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# endif
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
# if defined(__arm64__) || defined(__aarch64__)
# define USE_NEON64
# define INCLUDE_NEON64
# define USE_NEON64 1
# define INCLUDE_NEON64 1
# elif defined(__ARM_NEON__)
# define INCLUDE_NEON32
# else
# define INCLUDE_NEON32 1
# endif
#else

#endif

#if defined(AX_USE_SSE)
# define USE_SSE
# define INCLUDE_SSE
# define USE_SSE 1
# define INCLUDE_SSE 1
#endif

#ifdef INCLUDE_NEON32
Expand Down Expand Up @@ -298,4 +301,34 @@ void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
#endif
}

/**
 * Transforms `count` vertices from `src` into `dst` using `transform`.
 *
 * The work is forwarded to one of the MathUtilNeon / MathUtilNeon64 /
 * MathUtilC implementations. The choice is made at compile time through the
 * USE_NEON32 / USE_NEON64 / INCLUDE_NEON32 macros; when only INCLUDE_NEON32
 * is defined, the choice is made at run time via isNeon32Enabled().
 */
void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
    // The optimized implementations rely on this exact vertex memory layout.
    static_assert(sizeof(V3F_C4B_T2F) == 24, "V3F_C4B_T2F must be 24 bytes");
    static_assert(offsetof(V3F_C4B_T2F, vertices) == 0, "vertices must be at offset 0");
    static_assert(offsetof(V3F_C4B_T2F, colors) == 12, "colors must be at offset 12");
    static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16, "texCoords must be at offset 16");

#if defined(USE_NEON32)
    MathUtilNeon::transformVertices(dst, src, count, transform);
#elif defined(USE_NEON64)
    MathUtilNeon64::transformVertices(dst, src, count, transform);
#elif defined(INCLUDE_NEON32)
    // NEON32 code is compiled in but may be unavailable on this device.
    if (!isNeon32Enabled())
    {
        MathUtilC::transformVertices(dst, src, count, transform);
    }
    else
    {
        MathUtilNeon::transformVertices(dst, src, count, transform);
    }
#else
    MathUtilC::transformVertices(dst, src, count, transform);
#endif
}

/**
 * Writes `count` indices to `dst`, each being the matching `src` index plus
 * `offset`. Uses the NEON64 implementation when USE_NEON64 is defined and the
 * plain C++ implementation otherwise.
 */
void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
#ifndef USE_NEON64
    MathUtilC::transformIndices(dst, src, count, offset);
#else
    MathUtilNeon64::transformIndices(dst, src, count, offset);
#endif
}

NS_AX_MATH_END
12 changes: 12 additions & 0 deletions core/math/MathUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Copyright 2013 BlackBerry Inc.
Copyright (c) 2014-2017 Chukong Technologies
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,13 +30,20 @@

#include "math/MathBase.h"


NS_AX_BEGIN
struct V3F_C4B_T2F;
NS_AX_END

/**
* @addtogroup base
* @{
*/

NS_AX_MATH_BEGIN

class Mat4;

/**
* Defines a math utility class.
*
Expand All @@ -45,6 +53,7 @@ class AX_DLL MathUtil
{
friend class Mat4;
friend class Vec3;
friend class Renderer;

public:
/**
Expand Down Expand Up @@ -130,6 +139,9 @@ class AX_DLL MathUtil
static void transformVec4(const float* m, const float* v, float* dst);

static void crossVec3(const float* v1, const float* v2, float* dst);

/**
 * Transforms an array of vertices by a matrix.
 *
 * Dispatches to a platform-specific implementation (NEON32, NEON64 or plain
 * C++). In the plain C++ implementation each position is transformed as a
 * point (the matrix translation is added) and the color and texture
 * coordinates are copied unchanged.
 * NOTE(review): the NEON variants are presumably equivalent - confirm.
 *
 * @param dst Destination vertex array; receives `count` vertices.
 * @param src Source vertex array holding `count` vertices.
 * @param count Number of vertices to process.
 * @param transform Matrix applied to each vertex position.
 */
static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
/**
 * Copies an array of 16-bit indices while adding a constant offset to each.
 *
 * Dispatches to the NEON64 implementation when available, otherwise to the
 * plain C++ implementation.
 *
 * @param dst Destination index array; receives `count` indices.
 * @param src Source index array holding `count` indices.
 * @param count Number of indices to process.
 * @param offset Value added to every source index; the sum is stored back
 *               into a uint16_t.
 */
static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
};

NS_AX_MATH_END
Expand Down
58 changes: 42 additions & 16 deletions core/math/MathUtil.inl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
Copyright 2013 BlackBerry Inc.
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -24,24 +25,20 @@ class MathUtilC
{
public:
inline static void addMatrix(const float* m, float scalar, float* dst);

inline static void addMatrix(const float* m1, const float* m2, float* dst);

inline static void subtractMatrix(const float* m1, const float* m2, float* dst);

inline static void multiplyMatrix(const float* m, float scalar, float* dst);

inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);

inline static void negateMatrix(const float* m, float* dst);

inline static void transposeMatrix(const float* m, float* dst);

inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

inline static void transformVec4(const float* m, const float* v, float* dst);

inline static void crossVec3(const float* v1, const float* v2, float* dst);

// Plain C++ vertex transform: writes `count` vertices to dst, with each
// position transformed by `transform` and colors/texCoords copied from src.
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
// Plain C++ index transform: writes `count` indices to dst, each being the
// matching src index plus `offset`.
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
};

inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
Expand Down Expand Up @@ -128,27 +125,27 @@ inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* d
{
// Support the case where m1 or m2 is the same array as dst.
float product[16];

product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3];
product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3];
product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3];
product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3];

product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7];
product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7];
product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7];
product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7];

product[8] = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11];
product[9] = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11];
product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11];
product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11];

product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15];
product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15];
product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];

memcpy(dst, product, MATRIX_SIZE);
}

Expand Down Expand Up @@ -197,7 +194,7 @@ inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];

dst[0] = x;
dst[1] = y;
dst[2] = z;
Expand All @@ -209,10 +206,39 @@ inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);

dst[0] = x;
dst[1] = y;
dst[2] = z;
}

/**
 * Plain C++ implementation of the vertex transform.
 *
 * For each of the `count` vertices the position is multiplied by `transform`
 * as a point (the translation column m[12..14] is added), and the color and
 * texture coordinates are copied from the source vertex unchanged.
 */
inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
    // Work on a local copy of the matrix so the compiler can assume that
    // stores to dst never modify the matrix values.
    auto localTransform = transform;
    const float* m      = localTransform.m;

    // colors and texCoords are copied together with a single memcpy.
    const size_t tailBytes = sizeof(dst->colors) + sizeof(dst->texCoords);

    for (size_t i = 0; i < count; ++i)
    {
        const V3F_C4B_T2F& in = src[i];
        V3F_C4B_T2F& out      = dst[i];

        // Read the whole source position before writing any output component.
        const auto px = in.vertices.x;
        const auto py = in.vertices.y;
        const auto pz = in.vertices.z;

        out.vertices.x = px * m[0] + py * m[4] + pz * m[8] + m[12];
        out.vertices.y = px * m[1] + py * m[5] + pz * m[9] + m[13];
        out.vertices.z = px * m[2] + py * m[6] + pz * m[10] + m[14];

        memcpy(&out.colors, &in.colors, tailBytes);
    }
}

/**
 * Plain C++ implementation of the index transform: stores `src[i] + offset`
 * into `dst[i]` for each of the `count` indices. The sum is converted back
 * to uint16_t on assignment.
 */
inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
    for (size_t i = 0; i < count; ++i)
    {
        dst[i] = src[i] + offset;
    }
}

NS_AX_MATH_END
Loading

0 comments on commit 807a91b

Please sign in to comment.