Optimize Renderer::fillVerticesAndIndices() (axmolengine#2065)

* Optimize `Renderer::fillVerticesAndIndices()`
* Fix clobbered registers not being marked in inline assembly
Joilnen · Aug 2, 2024 · 807a91b · 807a91b
1 parent 377f340
commit 807a91b
Show file tree

Hide file tree

Showing 11 changed files with 559 additions and 116 deletions.
diff --git a/core/math/MathUtil.cpp b/core/math/MathUtil.cpp
@@ -1,6 +1,7 @@
 /**
 Copyright 2013 BlackBerry Inc.
 Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -20,6 +21,7 @@ This file was modified to fit the cocos2d-x project
 */
 
 #include "math/MathUtil.h"
+#include "math/Mat4.h"
 #include "base/Macros.h"
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
@@ -35,28 +37,29 @@ This file was modified to fit the cocos2d-x project
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
 #    if defined(__arm64__)
-#        define USE_NEON64
-#        define INCLUDE_NEON64
+#        define USE_NEON64 1
+#        define INCLUDE_NEON64 1
 #    elif defined(__ARM_NEON__)
-#        define USE_NEON32
-#        define INCLUDE_NEON32
-#    else
+#        define USE_NEON32 1
+#        define INCLUDE_NEON32 1
+#    endif
+#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
+#    if defined(__arm64__) || defined(__aarch64__)
+#        define USE_NEON64 1
+#        define INCLUDE_NEON64 1
 #    endif
 #elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
 #    if defined(__arm64__) || defined(__aarch64__)
-#        define USE_NEON64
-#        define INCLUDE_NEON64
+#        define USE_NEON64 1
+#        define INCLUDE_NEON64 1
 #    elif defined(__ARM_NEON__)
-#        define INCLUDE_NEON32
-#    else
+#        define INCLUDE_NEON32 1
 #    endif
-#else
-
 #endif
 
 #if defined(AX_USE_SSE)
-#    define USE_SSE
-#    define INCLUDE_SSE
+#    define USE_SSE 1
+#    define INCLUDE_SSE 1
 #endif
 
 #ifdef INCLUDE_NEON32
@@ -298,4 +301,34 @@ void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
 #endif
 }
 
+void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)  // Forwards to the NEON or plain-C transformVertices selected by the USE_*/INCLUDE_* macros.
+{
+    // Compile-time layout checks: MathUtilC::transformVertices copies colors + texCoords with one memcpy, so texCoords must directly follow colors
+    static_assert(sizeof(V3F_C4B_T2F) == 24);
+    static_assert(offsetof(V3F_C4B_T2F, vertices) == 0);
+    static_assert(offsetof(V3F_C4B_T2F, colors) == 12);
+    static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16);
+
+#ifdef USE_NEON32  // NEON32 chosen at compile time
+    MathUtilNeon::transformVertices(dst, src, count, transform);
+#elif defined(USE_NEON64)  // NEON64 chosen at compile time
+    MathUtilNeon64::transformVertices(dst, src, count, transform);
+#elif defined(INCLUDE_NEON32)  // NEON32 compiled in but not forced: choose per call
+    if (isNeon32Enabled())  // defined outside this hunk; presumably a CPU-feature check -- confirm
+        MathUtilNeon::transformVertices(dst, src, count, transform);
+    else
+        MathUtilC::transformVertices(dst, src, count, transform);
+#else  // no NEON path; there is no USE_SSE branch, so SSE builds use the C path too
+    MathUtilC::transformVertices(dst, src, count, transform);
+#endif
+}
+
+void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) {  // Forwards to the NEON64 implementation when USE_NEON64 is defined, otherwise to MathUtilC.
+#if defined(USE_NEON64)
+    MathUtilNeon64::transformIndices(dst, src, count, offset);
+#else  // no NEON32 or SSE variant is referenced here; every other configuration uses the C loop
+    MathUtilC::transformIndices(dst, src, count, offset);
+#endif
+}
+
 NS_AX_MATH_END
diff --git a/core/math/MathUtil.h b/core/math/MathUtil.h
@@ -2,6 +2,7 @@
  Copyright 2013 BlackBerry Inc.
  Copyright (c) 2014-2017 Chukong Technologies
  Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -29,13 +30,20 @@
 
 #include "math/MathBase.h"
 
+
+NS_AX_BEGIN
+    struct V3F_C4B_T2F;
+NS_AX_END
+
 /**
  * @addtogroup base
  * @{
  */
 
 NS_AX_MATH_BEGIN
 
+class Mat4;
+
 /**
  * Defines a math utility class.
  *
@@ -45,6 +53,7 @@ class AX_DLL MathUtil
 {
     friend class Mat4;
     friend class Vec3;
+    friend class Renderer;
 
 public:
     /**
@@ -130,6 +139,9 @@ class AX_DLL MathUtil
     static void transformVec4(const float* m, const float* v, float* dst);
 
     static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
+    static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
 };
 
 NS_AX_MATH_END

diff --git a/core/math/MathUtil.inl b/core/math/MathUtil.inl
@@ -1,5 +1,6 @@
 /**
  Copyright 2013 BlackBerry Inc.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -24,24 +25,20 @@ class MathUtilC
 {
 public:
     inline static void addMatrix(const float* m, float scalar, float* dst);
-
     inline static void addMatrix(const float* m1, const float* m2, float* dst);
-
     inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-
     inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-
     inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-    
+
     inline static void negateMatrix(const float* m, float* dst);
-
     inline static void transposeMatrix(const float* m, float* dst);
-    
+
     inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-
     inline static void transformVec4(const float* m, const float* v, float* dst);
-
     inline static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
+    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
 };
 
 inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
@@ -128,27 +125,27 @@ inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* d
 {
     // Support the case where m1 or m2 is the same array as dst.
     float product[16];
-    
+
     product[0]  = m1[0] * m2[0]  + m1[4] * m2[1] + m1[8]   * m2[2]  + m1[12] * m2[3];
     product[1]  = m1[1] * m2[0]  + m1[5] * m2[1] + m1[9]   * m2[2]  + m1[13] * m2[3];
     product[2]  = m1[2] * m2[0]  + m1[6] * m2[1] + m1[10]  * m2[2]  + m1[14] * m2[3];
     product[3]  = m1[3] * m2[0]  + m1[7] * m2[1] + m1[11]  * m2[2]  + m1[15] * m2[3];
-    
+
     product[4]  = m1[0] * m2[4]  + m1[4] * m2[5] + m1[8]   * m2[6]  + m1[12] * m2[7];
     product[5]  = m1[1] * m2[4]  + m1[5] * m2[5] + m1[9]   * m2[6]  + m1[13] * m2[7];
     product[6]  = m1[2] * m2[4]  + m1[6] * m2[5] + m1[10]  * m2[6]  + m1[14] * m2[7];
     product[7]  = m1[3] * m2[4]  + m1[7] * m2[5] + m1[11]  * m2[6]  + m1[15] * m2[7];
-    
+
     product[8]  = m1[0] * m2[8]  + m1[4] * m2[9] + m1[8]   * m2[10] + m1[12] * m2[11];
     product[9]  = m1[1] * m2[8]  + m1[5] * m2[9] + m1[9]   * m2[10] + m1[13] * m2[11];
     product[10] = m1[2] * m2[8]  + m1[6] * m2[9] + m1[10]  * m2[10] + m1[14] * m2[11];
     product[11] = m1[3] * m2[8]  + m1[7] * m2[9] + m1[11]  * m2[10] + m1[15] * m2[11];
-    
+
     product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8]  * m2[14] + m1[12] * m2[15];
     product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9]  * m2[14] + m1[13] * m2[15];
     product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
     product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
-    
+
     memcpy(dst, product, MATRIX_SIZE);
 }
 
@@ -197,7 +194,7 @@ inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
     float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
     float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
     float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
-    
+
     dst[0] = x;
     dst[1] = y;
     dst[2] = z;
@@ -209,10 +206,39 @@ inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
     float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
     float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
     float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
-    
+
     dst[0] = x;
     dst[1] = y;
     dst[2] = z;
 }
 
+inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)  // Scalar fallback: writes count vertices to dst with positions transformed by `transform`; colors and texCoords are copied from src unchanged.
+{
+    auto end = dst + count;  // one past the last destination vertex
+    auto t = transform; // Make copy for better aliasing inference
+    auto m = t.m;  // element storage of the local copy; indices 0..14 are read below
+
+    while (dst < end)
+    {
+        auto pos = src->vertices;  // copy the source position before any write to dst
+        dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];  // m[12..14] are added unscaled, i.e. the position is treated as having w == 1
+        dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
+        dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];  // m[3], m[7], m[11], m[15] are never read: no w output, no perspective divide
+        memcpy(&dst->colors, &src->colors, sizeof(dst->colors) + sizeof(dst->texCoords));  // one copy covers colors and texCoords; relies on texCoords being laid out directly after colors
+        ++dst;
+        ++src;
+    }
+}
+
+inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)  // Scalar fallback: dst[i] = src[i] + offset for i in [0, count).
+{
+    auto end = dst + count;  // one past the last destination index
+    while (dst < end)
+    {
+        *dst = *src + offset;  // operands are promoted to int for the add; the assignment converts back to uint16_t (modulo 65536)
+        ++dst;
+        ++src;
+    }
+}
+
 NS_AX_MATH_END