diff --git a/core/math/MathUtil.cpp b/core/math/MathUtil.cpp
index 6e6681665ce1..9fb49000b3b2 100644
--- a/core/math/MathUtil.cpp
+++ b/core/math/MathUtil.cpp
@@ -1,6 +1,7 @@
 /**
 Copyright 2013 BlackBerry Inc.
 Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -20,6 +21,7 @@ This file was modified to fit the cocos2d-x project
 */
 
 #include "math/MathUtil.h"
+#include "math/Mat4.h"
 #include "base/Macros.h"
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
@@ -35,28 +37,29 @@ This file was modified to fit the cocos2d-x project
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
 #    if defined(__arm64__)
-#        define USE_NEON64
-#        define INCLUDE_NEON64
+#        define USE_NEON64 1  // USE_*: MathUtil::transformVertices/transformIndices call this backend unconditionally
+#        define INCLUDE_NEON64 1
 #    elif defined(__ARM_NEON__)
-#        define USE_NEON32
-#        define INCLUDE_NEON32
-#    else
+#        define USE_NEON32 1
+#        define INCLUDE_NEON32 1
+#    endif
+#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)  // macOS: NEON64 on arm64 builds only, otherwise no NEON macros
+#    if defined(__arm64__) || defined(__aarch64__)
+#        define USE_NEON64 1
+#        define INCLUDE_NEON64 1
 #    endif
 #elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
 #    if defined(__arm64__) || defined(__aarch64__)
-#        define USE_NEON64
-#        define INCLUDE_NEON64
+#        define USE_NEON64 1
+#        define INCLUDE_NEON64 1
 #    elif defined(__ARM_NEON__)
-#        define INCLUDE_NEON32
-#    else
+#        define INCLUDE_NEON32 1  // INCLUDE_ without USE_: NEON32 is picked at run time via isNeon32Enabled()
 #    endif
-#else
-
 #endif
 
 #if defined(AX_USE_SSE)
-#    define USE_SSE
-#    define INCLUDE_SSE
+#    define USE_SSE 1
+#    define INCLUDE_SSE 1
 #endif
 
 #ifdef INCLUDE_NEON32
@@ -298,4 +301,34 @@ void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
 #endif
 }
 
+void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
+{
+    // Forwards (dst, src, count, transform) to one backend. The asserts pin the layout the backends assume:
+    static_assert(sizeof(V3F_C4B_T2F) == 24);               // 24-byte vertex
+    static_assert(offsetof(V3F_C4B_T2F, vertices) == 0);    // position starts at byte 0
+    static_assert(offsetof(V3F_C4B_T2F, colors) == 12);     // color starts at byte 12
+    static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16);  // texCoords start at byte 16
+    // Backend choice: USE_* macros select NEON at compile time, INCLUDE_NEON32 alone probes at run time.
+#ifdef USE_NEON32
+    MathUtilNeon::transformVertices(dst, src, count, transform);
+#elif defined(USE_NEON64)
+    MathUtilNeon64::transformVertices(dst, src, count, transform);
+#elif defined(INCLUDE_NEON32)
+    if (isNeon32Enabled())  // NEON32 code is compiled in but only used when this check passes
+        MathUtilNeon::transformVertices(dst, src, count, transform);
+    else
+        MathUtilC::transformVertices(dst, src, count, transform);
+#else
+    MathUtilC::transformVertices(dst, src, count, transform);  // portable C++ fallback
+#endif
+}
+
+void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) {
+#if defined(USE_NEON64)  // forwards (dst, src, count, offset) to the NEON64 variant when it is compiled in
+    MathUtilNeon64::transformIndices(dst, src, count, offset);
+#else  // every other configuration uses the portable loop: dst[i] = src[i] + offset
+    MathUtilC::transformIndices(dst, src, count, offset);
+#endif
+}
+
 NS_AX_MATH_END
diff --git a/core/math/MathUtil.h b/core/math/MathUtil.h
index 1fbe4060755f..7cb78b7845f0 100644
--- a/core/math/MathUtil.h
+++ b/core/math/MathUtil.h
@@ -2,6 +2,7 @@
  Copyright 2013 BlackBerry Inc.
  Copyright (c) 2014-2017 Chukong Technologies
  Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -29,6 +30,11 @@
 
 #include "math/MathBase.h"
 
+
+NS_AX_BEGIN
+    struct V3F_C4B_T2F;
+NS_AX_END
+
 /**
  * @addtogroup base
  * @{
@@ -36,6 +42,8 @@
 
 NS_AX_MATH_BEGIN
 
+class Mat4;
+
 /**
  * Defines a math utility class.
  *
@@ -45,6 +53,7 @@ class AX_DLL MathUtil
 {
     friend class Mat4;
     friend class Vec3;
+    friend class Renderer;
 
 public:
     /**
@@ -130,6 +139,9 @@ class AX_DLL MathUtil
     static void transformVec4(const float* m, const float* v, float* dst);
 
     static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
+    static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
 };
 
 NS_AX_MATH_END
diff --git a/core/math/MathUtil.inl b/core/math/MathUtil.inl
index 397c336d8e44..4d7028bdbd59 100644
--- a/core/math/MathUtil.inl
+++ b/core/math/MathUtil.inl
@@ -1,5 +1,6 @@
 /**
  Copyright 2013 BlackBerry Inc.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -24,24 +25,20 @@ class MathUtilC
 {
 public:
     inline static void addMatrix(const float* m, float scalar, float* dst);
-    
     inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    
     inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-    
+
     inline static void negateMatrix(const float* m, float* dst);
-    
     inline static void transposeMatrix(const float* m, float* dst);
-    
+
     inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    
     inline static void transformVec4(const float* m, const float* v, float* dst);
-    
     inline static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
+    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
 };
 
 inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
@@ -128,27 +125,27 @@ inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* d
 {
     // Support the case where m1 or m2 is the same array as dst.
     float product[16];
-    
+
     product[0]  = m1[0] * m2[0]  + m1[4] * m2[1] + m1[8]   * m2[2]  + m1[12] * m2[3];
     product[1]  = m1[1] * m2[0]  + m1[5] * m2[1] + m1[9]   * m2[2]  + m1[13] * m2[3];
     product[2]  = m1[2] * m2[0]  + m1[6] * m2[1] + m1[10]  * m2[2]  + m1[14] * m2[3];
     product[3]  = m1[3] * m2[0]  + m1[7] * m2[1] + m1[11]  * m2[2]  + m1[15] * m2[3];
-    
+
     product[4]  = m1[0] * m2[4]  + m1[4] * m2[5] + m1[8]   * m2[6]  + m1[12] * m2[7];
     product[5]  = m1[1] * m2[4]  + m1[5] * m2[5] + m1[9]   * m2[6]  + m1[13] * m2[7];
     product[6]  = m1[2] * m2[4]  + m1[6] * m2[5] + m1[10]  * m2[6]  + m1[14] * m2[7];
     product[7]  = m1[3] * m2[4]  + m1[7] * m2[5] + m1[11]  * m2[6]  + m1[15] * m2[7];
-    
+
     product[8]  = m1[0] * m2[8]  + m1[4] * m2[9] + m1[8]   * m2[10] + m1[12] * m2[11];
     product[9]  = m1[1] * m2[8]  + m1[5] * m2[9] + m1[9]   * m2[10] + m1[13] * m2[11];
     product[10] = m1[2] * m2[8]  + m1[6] * m2[9] + m1[10]  * m2[10] + m1[14] * m2[11];
     product[11] = m1[3] * m2[8]  + m1[7] * m2[9] + m1[11]  * m2[10] + m1[15] * m2[11];
-    
+
     product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8]  * m2[14] + m1[12] * m2[15];
     product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9]  * m2[14] + m1[13] * m2[15];
     product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
     product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
-    
+
     memcpy(dst, product, MATRIX_SIZE);
 }
 
@@ -197,7 +194,7 @@ inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
     float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
     float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
     float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
-    
+
     dst[0] = x;
     dst[1] = y;
     dst[2] = z;
@@ -209,10 +206,39 @@ inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
     float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
     float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
     float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
-    
+
     dst[0] = x;
     dst[1] = y;
     dst[2] = z;
 }
 
+inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
+{
+    // Portable fallback. For each of the `count` vertices: dst position = transform * (src position, w = 1);
+    // colors and texCoords are copied through unchanged. dst may equal src (in-place use).
+    const auto t = transform;  // Make copy for better aliasing inference
+    const auto m = t.m;        // matrix elements, indexed column-major below
+    for (size_t i = 0; i < count; ++i)
+    {
+        const auto pos = src[i].vertices;  // snapshot before writing so in-place use reads the old x/y/z
+        dst[i].vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];
+        dst[i].vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
+        dst[i].vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
+        // One copy spans colors + texCoords; this relies on texCoords directly following colors in the vertex.
+        // memmove, not memcpy: memcpy is undefined when source and destination overlap (dst == src).
+        memmove(&dst[i].colors, &src[i].colors, sizeof(dst[i].colors) + sizeof(dst[i].texCoords));
+    }
+}
+
+inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
+{
+    // Portable fallback: dst[i] = src[i] + offset for every i in [0, count).
+    // The sum is formed after integer promotion and narrowed back to uint16_t, so it wraps modulo 65536.
+    for (size_t i = 0; i < count; ++i)
+    {
+        const int shifted = src[i] + offset;
+        dst[i] = static_cast<uint16_t>(shifted);
+    }
+}
+
 NS_AX_MATH_END
diff --git a/core/math/MathUtilNeon.inl b/core/math/MathUtilNeon.inl
index 7479649cb0a0..e80382490351 100644
--- a/core/math/MathUtilNeon.inl
+++ b/core/math/MathUtilNeon.inl
@@ -1,5 +1,6 @@
 /**
  Copyright 2013 BlackBerry Inc.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -17,30 +18,28 @@
 
  This file was modified to fit the cocos2d-x project
  */
+
+#include <arm_neon.h>
+
 NS_AX_MATH_BEGIN
 
 class MathUtilNeon
 {
 public:
     inline static void addMatrix(const float* m, float scalar, float* dst);
-    
     inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    
     inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-    
+
     inline static void negateMatrix(const float* m, float* dst);
-    
     inline static void transposeMatrix(const float* m, float* dst);
-    
+
     inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    
     inline static void transformVec4(const float* m, const float* v, float* dst);
-    
     inline static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    inline static void transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform);
 };
 
 inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
@@ -52,12 +51,12 @@ inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
                  "vmov.f32 s17, s16          \n\t" // s
                  "vmov.f32 s18, s16          \n\t" // s
                  "vmov.f32 s19, s16          \n\t" // s
-                 
+
                  "vadd.f32 q8, q0, q4        \n\t" // DST->M[m0-m3] = M[m0-m3] + s
                  "vadd.f32 q9, q1, q4        \n\t" // DST->M[m4-m7] = M[m4-m7] + s
                  "vadd.f32 q10, q2, q4       \n\t" // DST->M[m8-m11] = M[m8-m11] + s
                  "vadd.f32 q11, q3, q4       \n\t" // DST->M[m12-m15] = M[m12-m15] + s
-                 
+
                  "vst1.32 {q8, q9}, [%0]!    \n\t" // DST->M[m0-m7]
                  "vst1.32 {q10, q11}, [%0]   \n\t" // DST->M[m8-m15]
                  :
@@ -73,12 +72,12 @@ inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst
                  "vld1.32     {q2, q3},     [%1]  \n\t" // M1[m8-m15]
                  "vld1.32     {q8, q9},     [%2]! \n\t" // M2[m0-m7]
                  "vld1.32     {q10, q11}, [%2]    \n\t" // M2[m8-m15]
-                 
+
                  "vadd.f32   q12, q0, q8          \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
                  "vadd.f32   q13, q1, q9          \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
                  "vadd.f32   q14, q2, q10         \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
                  "vadd.f32   q15, q3, q11         \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
-                 
+
                  "vst1.32    {q12, q13}, [%0]!    \n\t" // DST->M[m0-m7]
                  "vst1.32    {q14, q15}, [%0]     \n\t" // DST->M[m8-m15]
                  :
@@ -94,12 +93,12 @@ inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float
                  "vld1.32     {q2, q3},     [%1]   \n\t" // M1[m8-m15]
                  "vld1.32     {q8, q9},     [%2]!  \n\t" // M2[m0-m7]
                  "vld1.32     {q10, q11}, [%2]     \n\t" // M2[m8-m15]
-                 
+
                  "vsub.f32   q12, q0, q8         \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
                  "vsub.f32   q13, q1, q9         \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
                  "vsub.f32   q14, q2, q10        \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
                  "vsub.f32   q15, q3, q11        \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
-                 
+
                  "vst1.32    {q12, q13}, [%0]!   \n\t" // DST->M[m0-m7]
                  "vst1.32    {q14, q15}, [%0]    \n\t" // DST->M[m8-m15]
                  :
@@ -114,12 +113,12 @@ inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* ds
                  "vld1.32     {d0[0]},         [%2]        \n\t" // M[m0-m7]
                  "vld1.32    {q4-q5},          [%1]!       \n\t" // M[m8-m15]
                  "vld1.32    {q6-q7},          [%1]        \n\t" // s
-                 
+
                  "vmul.f32     q8, q4, d0[0]               \n\t" // DST->M[m0-m3] = M[m0-m3] * s
                  "vmul.f32     q9, q5, d0[0]               \n\t" // DST->M[m4-m7] = M[m4-m7] * s
                  "vmul.f32     q10, q6, d0[0]              \n\t" // DST->M[m8-m11] = M[m8-m11] * s
                  "vmul.f32     q11, q7, d0[0]              \n\t" // DST->M[m12-m15] = M[m12-m15] * s
-                 
+
                  "vst1.32     {q8-q9},           [%0]!     \n\t" // DST->M[m0-m7]
                  "vst1.32     {q10-q11},         [%0]      \n\t" // DST->M[m8-m15]
                  :
@@ -135,30 +134,30 @@ inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float
                  "vld1.32     {d20 - d23}, [%1]  \n\t"       // M1[m8-m15]
                  "vld1.32     {d0 - d3}, [%2]!   \n\t"       // M2[m0-m7]
                  "vld1.32     {d4 - d7}, [%2]    \n\t"       // M2[m8-m15]
-                 
+
                  "vmul.f32    q12, q8, d0[0]     \n\t"         // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
                  "vmul.f32    q13, q8, d2[0]     \n\t"         // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
                  "vmul.f32    q14, q8, d4[0]     \n\t"         // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
                  "vmul.f32    q15, q8, d6[0]     \n\t"         // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
-                 
+
                  "vmla.f32    q12, q9, d0[1]     \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
                  "vmla.f32    q13, q9, d2[1]     \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
                  "vmla.f32    q14, q9, d4[1]     \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
                  "vmla.f32    q15, q9, d6[1]     \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
-                 
+
                  "vmla.f32    q12, q10, d1[0]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
                  "vmla.f32    q13, q10, d3[0]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
                  "vmla.f32    q14, q10, d5[0]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
                  "vmla.f32    q15, q10, d7[0]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
-                 
+
                  "vmla.f32    q12, q11, d1[1]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
                  "vmla.f32    q13, q11, d3[1]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
                  "vmla.f32    q14, q11, d5[1]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
                  "vmla.f32    q15, q11, d7[1]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
-                 
+
                  "vst1.32    {d24 - d27}, [%0]!  \n\t"       // DST->M[m0-m7]
                  "vst1.32    {d28 - d31}, [%0]   \n\t"       // DST->M[m8-m15]
-                 
+
                  : // output
                  : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
                  : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -170,12 +169,12 @@ inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
     asm volatile(
                  "vld1.32     {q0-q1},  [%1]!     \n\t" // load m0-m7
                  "vld1.32     {q2-q3},  [%1]      \n\t" // load m8-m15
-                 
+
                  "vneg.f32     q4, q0             \n\t" // negate m0-m3
                  "vneg.f32     q5, q1             \n\t" // negate m4-m7
                  "vneg.f32     q6, q2             \n\t" // negate m8-m15
                  "vneg.f32     q7, q3             \n\t" // negate m8-m15
-                 
+
                  "vst1.32     {q4-q5},  [%0]!     \n\t" // store m0-m7
                  "vst1.32     {q6-q7},  [%0]      \n\t" // store m8-m15
                  :
@@ -191,7 +190,7 @@ inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
                  "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]!    \n\t" // DST->M[m1, m5, m9, m12] = M[m4-m7]
                  "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]!    \n\t" // DST->M[m2, m6, m10, m12] = M[m8-m11]
                  "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1]     \n\t" // DST->M[m3, m7, m11, m12] = M[m12-m15]
-                 
+
                  "vst1.32 {q0-q1}, [%0]!                         \n\t" // DST->M[m0-m7]
                  "vst1.32 {q2-q3}, [%0]                          \n\t" // DST->M[m8-m15]
                  :
@@ -209,12 +208,12 @@ inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float
                  "vld1.32    {d1[1]},        [%4]    \n\t"    // V[w]
                  "vld1.32    {d18 - d21},    [%5]!   \n\t"    // M[m0-m7]
                  "vld1.32    {d22 - d25},    [%5]    \n\t"    // M[m8-m15]
-                 
+
                  "vmul.f32 q13,  q9, d0[0]           \n\t"    // DST->V = M[m0-m3] * V[x]
                  "vmla.f32 q13, q10, d0[1]           \n\t"    // DST->V += M[m4-m7] * V[y]
                  "vmla.f32 q13, q11, d1[0]           \n\t"    // DST->V += M[m8-m11] * V[z]
                  "vmla.f32 q13, q12, d1[1]           \n\t"    // DST->V += M[m12-m15] * V[w]
-                 
+
                  "vst1.32 {d26}, [%0]!               \n\t"    // DST->V[x, y]
                  "vst1.32 {d27[0]}, [%0]             \n\t"    // DST->V[z]
                  :
@@ -230,12 +229,12 @@ inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* d
      "vld1.32    {d0, d1}, [%1]     \n\t"   // V[x, y, z, w]
      "vld1.32    {d18 - d21}, [%2]! \n\t"   // M[m0-m7]
      "vld1.32    {d22 - d25}, [%2]  \n\t"    // M[m8-m15]
-     
+
      "vmul.f32   q13, q9, d0[0]     \n\t"   // DST->V = M[m0-m3] * V[x]
      "vmla.f32   q13, q10, d0[1]    \n\t"   // DST->V = M[m4-m7] * V[y]
      "vmla.f32   q13, q11, d1[0]    \n\t"   // DST->V = M[m8-m11] * V[z]
      "vmla.f32   q13, q12, d1[1]    \n\t"   // DST->V = M[m12-m15] * V[w]
-     
+
      "vst1.32    {d26, d27}, [%0]   \n\t"   // DST->V
      :
      : "r"(dst), "r"(v), "r"(m)
@@ -249,17 +248,17 @@ inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst
                  "vld1.32 {d1[1]},  [%1]         \n\t" //
                  "vld1.32 {d0},     [%2]         \n\t" //
                  "vmov.f32 s2, s1                \n\t" // q0 = (v1y, v1z, v1z, v1x)
-                 
+
                  "vld1.32 {d2[1]},  [%3]         \n\t" //
                  "vld1.32 {d3},     [%4]         \n\t" //
                  "vmov.f32 s4, s7                  \n\t" // q1 = (v2z, v2x, v2y, v2z)
-                 
+
                  "vmul.f32 d4, d0, d2            \n\t" // x = v1y * v2z, y = v1z * v2x
                  "vmls.f32 d4, d1, d3            \n\t" // x -= v1z * v2y, y-= v1x - v2z
-                 
+
                  "vmul.f32 d5, d3, d1[1]         \n\t" // z = v1x * v2y
                  "vmls.f32 d5, d0, d2[1]         \n\t" // z-= v1y * vx
-                 
+
                  "vst1.32 {d4},       [%0]!      \n\t" // V[x, y]
                  "vst1.32 {d5[0]}, [%0]          \n\t" // V[z]
                  :
@@ -268,4 +267,105 @@ inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst
                  );
 }
 
+inline void MathUtilNeon::transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform)
+{  // NEON intrinsics path: writes `count` vertices to dst with positions transformed; colors/texCoords pass through.
+    auto end = dst + count;
+
+    // Load the matrix as four 4-float vectors (elements 0-3, 4-7, 8-11, 12-15), used as columns below
+    float32x4_t mc0 = vld1q_f32(transform.m);
+    float32x4_t mc1 = vld1q_f32(transform.m + 4);
+    float32x4_t mc2 = vld1q_f32(transform.m + 8);
+    float32x4_t mc3 = vld1q_f32(transform.m + 12);
+
+    // Process 4 vertices at a time
+    auto end4 = dst + count / 4 * 4;  // count rounded down to a multiple of 4
+    while (dst < end4)
+    {
+        // Load 4 vertices. zwN = (z, color): the 2-float load at &vertices.z also picks up the 4 color bytes
+        float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
+        float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
+        float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
+        float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
+        float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
+        float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
+        float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
+        float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
+        float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
+        float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
+        float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
+        float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
+
+        // Multiply x by column 0
+        float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
+        float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
+        float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
+        float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
+
+        // Multiply y by column 1 and add to result
+        r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
+        r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
+        r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
+        r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
+
+        // Multiply z by column 2 and add to result
+        r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
+        r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
+        r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
+        r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
+
+        // Add column 3 (the position is treated as having w = 1)
+        r0 = vaddq_f32(r0, mc3);
+        r1 = vaddq_f32(r1, mc3);
+        r2 = vaddq_f32(r2, mc3);
+        r3 = vaddq_f32(r3, mc3);
+
+        // Set color: lane 3 holds the unused transformed w; replace it with the color bits loaded in zwN
+        r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
+        r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
+        r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
+        r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
+
+        // Store result: the 4-float store writes x, y, z + color, the 2-float store writes texCoords
+        vst1q_f32(&dst[0].vertices.x, r0);
+        vst1_f32(&dst[0].texCoords.u, uv0);
+        vst1q_f32(&dst[1].vertices.x, r1);
+        vst1_f32(&dst[1].texCoords.u, uv1);
+        vst1q_f32(&dst[2].vertices.x, r2);
+        vst1_f32(&dst[2].texCoords.u, uv2);
+        vst1q_f32(&dst[3].vertices.x, r3);
+        vst1_f32(&dst[3].texCoords.u, uv3);
+
+        dst += 4;
+        src += 4;
+    }
+
+    // Process the remaining 0-3 vertices one at a time (same steps as the 4-wide loop)
+    while (dst < end)
+    {
+        // Load vertex (zw = z plus the color bits, as above)
+        float32x2_t xy = vld1_f32(&src->vertices.x);
+        float32x2_t zw = vld1_f32(&src->vertices.z);
+        float32x2_t uv = vld1_f32(&src->texCoords.u);
+
+        // Multiply x by column 0
+        float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
+        // Multiply y by column 1 and add to result
+        r = vmlaq_lane_f32(r, mc1, xy, 1);
+        // Multiply z by column 2 and add to result
+        r = vmlaq_lane_f32(r, mc2, zw, 0);
+        // Add column 3
+        r = vaddq_f32(r, mc3);
+
+        // Set color (overwrite lane 3 with the source color bits)
+        r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
+
+        // Store result: x, y, z + color, then texCoords
+        vst1q_f32(&dst->vertices.x, r);
+        vst1_f32(&dst->texCoords.u, uv);
+
+        ++dst;
+        ++src;
+    }
+}
+
 NS_AX_MATH_END
diff --git a/core/math/MathUtilNeon64.inl b/core/math/MathUtilNeon64.inl
index 3f683afcbd93..1bfb02759dc1 100644
--- a/core/math/MathUtilNeon64.inl
+++ b/core/math/MathUtilNeon64.inl
@@ -1,5 +1,6 @@
 /**
  Copyright 2013 BlackBerry Inc.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -18,30 +19,29 @@
  This file was modified to fit the cocos2d-x project
  */
 
+#include <arm_neon.h>
+#include "base/Types.h"
+
 NS_AX_MATH_BEGIN
 
 class MathUtilNeon64
 {
 public:
     inline static void addMatrix(const float* m, float scalar, float* dst);
-    
     inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    
     inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    
     inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-    
+
     inline static void negateMatrix(const float* m, float* dst);
-    
     inline static void transposeMatrix(const float* m, float* dst);
-    
+
     inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    
     inline static void transformVec4(const float* m, const float* v, float* dst);
-    
     inline static void crossVec3(const float* v1, const float* v2, float* dst);
+
+    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
+    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
 };
 
 inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
@@ -54,7 +54,7 @@ inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
 	    "fadd v9.4s, v1.4s, v4.4s			\n\t" // DST->M[m4-m7] = M[m4-m7] + s
 	    "fadd v10.4s, v2.4s, v4.4s			\n\t" // DST->M[m8-m11] = M[m8-m11] + s
 	    "fadd v11.4s, v3.4s, v4.4s			\n\t" // DST->M[m12-m15] = M[m12-m15] + s
-	
+
         "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] 	\n\t"    // Result in V9
 	    :
         : "r"(dst), "r"(m), "r"(&scalar)
@@ -73,7 +73,7 @@ inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* d
         "fadd   v14.4s, v2.4s, v10.4s         \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
         "fadd   v15.4s, v3.4s, v11.4s         \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
 
-        "st4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]    \n\t" // DST->M[m0-m7] DST->M[m8-m15] 
+        "st4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]    \n\t" // DST->M[m0-m7] DST->M[m8-m15]
         :
         : "r"(dst), "r"(m1), "r"(m2)
         : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
@@ -84,7 +84,7 @@ inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, flo
 {
     asm volatile(
         "ld4     {v0.4s, v1.4s, v2.4s, v3.4s},     [%1]  \n\t" // M1[m0-m7] M1[m8-m15]
-        "ld4     {v8.4s, v9.4s, v10.4s, v11.4s},   [%2]  \n\t" // M2[m0-m7] M2[m8-m15] 
+        "ld4     {v8.4s, v9.4s, v10.4s, v11.4s},   [%2]  \n\t" // M2[m0-m7] M2[m8-m15]
 
         "fsub   v12.4s, v0.4s, v8.4s         \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
         "fsub   v13.4s, v1.4s, v9.4s         \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
@@ -101,7 +101,7 @@ inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, flo
 inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
 {
     asm volatile(
-        "ld1     {v0.s}[0],         [%2]            \n\t" //s  
+        "ld1     {v0.s}[0],         [%2]            \n\t" //s
         "ld4     {v4.4s, v5.4s, v6.4s, v7.4s}, [%1]       \n\t" //M[m0-m7] M[m8-m15]
 
         "fmul     v8.4s, v4.4s, v0.s[0]               \n\t" // DST->M[m0-m3] = M[m0-m3] * s
@@ -171,8 +171,8 @@ inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
 inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
 {
     asm volatile(
-        "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]    \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3] 
-							 //DST->M[m1, m5, m9, m12] = M[m4-m7] 
+        "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]    \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
+							 //DST->M[m1, m5, m9, m12] = M[m4-m7]
         "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]    \n\t"
         :
         : "r"(dst), "r"(m)
@@ -189,7 +189,7 @@ inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, floa
         "ld1    {v0.s}[3],        [%4]    \n\t"    // V[w]
         "ld1    {v9.4s, v10.4s, v11.4s, v12.4s}, [%5]   \n\t"    // M[m0-m7] M[m8-m15]
 
-	
+
         "fmul v13.4s, v9.4s, v0.s[0]           \n\t"      // DST->V = M[m0-m3] * V[x]
         "fmla v13.4s, v10.4s, v0.s[1]           \n\t"    // DST->V += M[m4-m7] * V[y]
         "fmla v13.4s, v11.4s, v0.s[2]           \n\t"    // DST->V += M[m8-m11] * V[z]
@@ -198,8 +198,8 @@ inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, floa
         //"st1 {v13.4s}, [%0]               \n\t"    // DST->V[x, y] // DST->V[z]
         "st1 {v13.2s}, [%0], 8               \n\t"
         "st1 {v13.s}[2], [%0]                \n\t"
-        :
-        : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
+        : "+r"(dst)
+        : "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
         : "v0", "v9", "v10","v11", "v12", "v13", "memory"
     );
 }
@@ -256,10 +256,143 @@ inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* d
 
         "st1 {v2.2s},       [%0], 8      \n\t" // V[x, y]
         "st1 {v2.s}[2],     [%0]         \n\t" // V[z]
-        :
-        : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
+        : "+r"(dst)
+        : "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
         : "v0", "v1", "v2", "memory"
     );
 }
 
+inline void MathUtilNeon64::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
+{
+    auto end = dst + count;
+    // Writes 'count' vertices to 'dst': position transformed by 'transform', color and texCoords taken from 'src'
+    // Load the 16 matrix floats; m.val[i] holds transform.m[4*i .. 4*i+3] (column i in the formulas below)
+    float32x4x4_t m = vld1q_f32_x4(transform.m);
+
+    // Process 4 vertices at a time; end4 is dst advanced by count rounded down to a multiple of 4
+    auto end4 = dst + count / 4 * 4;
+    while (dst < end4)
+    {
+        // Do this for each vertex
+        // dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];
+        // dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
+        // dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
+
+        // First, load each vertex, multiply x by column 0 and add to column 3
+        // Note: since we're reading 4 floats it will load color bytes into v.w
+        float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
+        float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);
+        float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
+        float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
+        float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
+        float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
+        float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
+        float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
+
+        // Load texCoords (two floats starting at u); they are stored to dst as loaded
+        float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
+        float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
+        float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
+        float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
+
+        // Multiply y by column 1 and add to result
+        r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
+        r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
+        r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
+        r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
+
+        // Multiply z by column 2 and add to result
+        r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
+        r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
+        r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
+        r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
+
+        // Replace lane 3 of each result (a by-product of the multiply-adds) with lane 3 of the loaded vertex, i.e. the color
+        r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
+        r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
+        r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
+        r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
+
+        // Store result: each 4-float store writes x, y, z plus the color lane, each 2-float store writes texCoords
+        vst1q_f32(&dst[0].vertices.x, r0);
+        vst1_f32(&dst[0].texCoords.u, uv0);
+        vst1q_f32(&dst[1].vertices.x, r1);
+        vst1_f32(&dst[1].texCoords.u, uv1);
+        vst1q_f32(&dst[2].vertices.x, r2);
+        vst1_f32(&dst[2].texCoords.u, uv2);
+        vst1q_f32(&dst[3].vertices.x, r3);
+        vst1_f32(&dst[3].texCoords.u, uv3);
+
+        dst += 4;
+        src += 4;
+    }
+
+    // Process the remaining (count % 4) vertices one by one, using the same steps as above
+    while (dst < end)
+    {
+        float32x4_t v = vld1q_f32(&src->vertices.x);
+        float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);
+        r = vmlaq_laneq_f32(r, m.val[1], v, 1);
+        r = vmlaq_laneq_f32(r, m.val[2], v, 2);
+        r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);
+        float32x2_t uv = vld1_f32(&src->texCoords.u);
+        vst1q_f32(&dst->vertices.x, r);
+        vst1_f32(&dst->texCoords.u, uv);
+
+        ++dst;
+        ++src;
+    }
+}
+
+inline void MathUtilNeon64::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
+{
+    // Adds 'offset' to each of the 'count' indices read from 'src' and writes the sums to 'dst'.
+    // Each loop below re-checks 'count', so inputs shorter than a full batch simply
+    // fall through to the scalar tail.
+
+    // Broadcast the offset into all 8 lanes once, outside the loops
+    const uint16x8_t off = vdupq_n_u16(offset);
+
+    // Process 32 indices at a time if there's enough data
+    while (count >= 32)
+    {
+        // Load 32 indices
+        uint16x8x4_t v = vld1q_u16_x4(src);
+
+        // Add offset
+        v.val[0] = vaddq_u16(v.val[0], off);
+        v.val[1] = vaddq_u16(v.val[1], off);
+        v.val[2] = vaddq_u16(v.val[2], off);
+        v.val[3] = vaddq_u16(v.val[3], off);
+
+        // Store result
+        vst1q_u16_x4(dst, v);
+
+        dst += 32;
+        src += 32;
+        count -= 32;
+    }
+
+    // Process 8 indices at a time if there's enough data
+    while (count >= 8)
+    {
+        uint16x8_t v = vld1q_u16(src);
+        v = vaddq_u16(v, off);
+        vst1q_u16(dst, v);
+
+        dst += 8;
+        src += 8;
+        count -= 8;
+    }
+
+    // Process remaining indices one by one
+    while (count > 0)
+    {
+        *dst = *src + offset;
+        ++dst;
+        ++src;
+        --count;
+    }
+}
+
 NS_AX_MATH_END
diff --git a/core/renderer/Renderer.cpp b/core/renderer/Renderer.cpp
index d3b98ba15abe..544a5026a31f 100644
--- a/core/renderer/Renderer.cpp
+++ b/core/renderer/Renderer.cpp
@@ -589,23 +589,17 @@ void Renderer::setViewPort(int x, int y, unsigned int w, unsigned int h)
 
 void Renderer::fillVerticesAndIndices(const TrianglesCommand* cmd, unsigned int vertexBufferOffset)
 {
-    size_t vertexCount = cmd->getVertexCount();
-    memcpy(&_verts[_filledVertex], cmd->getVertices(), sizeof(V3F_C4B_T2F) * vertexCount);
-
-    // fill vertex, and convert them to world coordinates
-    const Mat4& modelView = cmd->getModelView();
-    for (size_t i = 0; i < vertexCount; ++i)
-    {
-        modelView.transformPoint(&(_verts[i + _filledVertex].vertices));
-    }
-
-    // fill index
-    const unsigned short* indices = cmd->getIndices();
-    size_t indexCount             = cmd->getIndexCount();
-    for (size_t i = 0; i < indexCount; ++i)
-    {
-        _indices[_filledIndex + i] = vertexBufferOffset + _filledVertex + indices[i];
-    }
+    auto destVertices = &_verts[_filledVertex];
+    auto srcVertices = cmd->getVertices();
+    auto vertexCount = cmd->getVertexCount();
+    auto&& modelView = cmd->getModelView();
+    MathUtil::transformVertices(destVertices, srcVertices, vertexCount, modelView);
+
+    auto destIndices = &_indices[_filledIndex];
+    auto srcIndices = cmd->getIndices();
+    auto indexCount = cmd->getIndexCount();
+    auto offset = vertexBufferOffset + _filledVertex;
+    MathUtil::transformIndices(destIndices, srcIndices, indexCount, int(offset));
 
     _filledVertex += vertexCount;
     _filledIndex += indexCount;
diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt
index f673f2e7bc4f..062589bb0742 100644
--- a/tests/unit-tests/CMakeLists.txt
+++ b/tests/unit-tests/CMakeLists.txt
@@ -42,6 +42,7 @@ _1klink("${sample-assets_SOURCE_DIR}/unit-tests/Content" "${CMAKE_CURRENT_LIST_D
 set(GAME_SOURCE
     Source/AppDelegate.cpp
     Source/doctest.cpp
+    Source/TestUtils.cpp
 
     Source/core/base/MapTests.cpp
     Source/core/base/UTF8Tests.cpp
@@ -74,13 +75,9 @@ elseif(WINDOWS)
 endif()
 
 if(ANDROID)
-    list(APPEND GAME_HEADER
-         Source/JNITest/JNITest.h
-         )
     list(APPEND GAME_SOURCE
-         Source/JNITest/JNITest.cpp
-         proj.android/app/jni/main.cpp
-         )
+        proj.android/app/jni/main.cpp
+    )
 elseif(LINUX)
     list(APPEND GAME_SOURCE
          proj.linux/main.cpp
diff --git a/tests/unit-tests/Source/TestUtils.cpp b/tests/unit-tests/Source/TestUtils.cpp
new file mode 100644
index 000000000000..ca4d083d4908
--- /dev/null
+++ b/tests/unit-tests/Source/TestUtils.cpp
@@ -0,0 +1,45 @@
+#include <doctest.h>
+#include "base/Types.h"
+#include "TestUtils.h"
+
+NS_AX_BEGIN
+
+
+// Formats a Vec2 as "(u, v)", rendering each component with std::to_string.
+doctest::String toString(const Vec2& value) {
+    const std::string first  = std::to_string(value.u);
+    const std::string second = std::to_string(value.v);
+
+    // Join the two components with ", " and wrap them in parentheses.
+    const std::string text = "(" + first + ", " + second + ")";
+    return text.c_str();
+}
+
+// Formats a Vec3 as "(x, y, z)", rendering each component with std::to_string.
+doctest::String toString(const Vec3& value) {
+    const std::string xText = std::to_string(value.x);
+    const std::string yText = std::to_string(value.y);
+    const std::string zText = std::to_string(value.z);
+
+    // Join the components with ", " and wrap them in parentheses.
+    const std::string text = "(" + xText + ", " + yText + ", " + zText + ")";
+
+    return text.c_str();
+}
+
+// Formats a Color4B as "(r, g, b, a)".
+doctest::String toString(const Color4B& value) {
+    // Each channel goes through std::to_string, so it is printed as a number.
+    const std::string red   = std::to_string(value.r);
+    const std::string green = std::to_string(value.g);
+    const std::string blue  = std::to_string(value.b);
+    const std::string alpha = std::to_string(value.a);
+
+    // Join the channels with ", " and wrap them in parentheses.
+    const std::string text = "(" + red + ", " + green + ", " + blue + ", " + alpha + ")";
+
+    return text.c_str();
+}
+
+
+NS_AX_END
diff --git a/tests/unit-tests/Source/TestUtils.h b/tests/unit-tests/Source/TestUtils.h
index 1631b13534e5..a6bf0af45474 100644
--- a/tests/unit-tests/Source/TestUtils.h
+++ b/tests/unit-tests/Source/TestUtils.h
@@ -39,3 +39,10 @@ class AsyncRunner {
         return f.get();
     }
 };
+
+// String conversions returning doctest::String for Color4B, Vec2 and Vec3 (definitions are in TestUtils.cpp).
+namespace ax {
+    doctest::String toString(const Color4B& value);
+    doctest::String toString(const Vec2& value);
+    doctest::String toString(const Vec3& value);
+}
diff --git a/tests/unit-tests/Source/core/math/MathUtilTests.cpp b/tests/unit-tests/Source/core/math/MathUtilTests.cpp
index c59a8c15264e..4c5e8523b41a 100644
--- a/tests/unit-tests/Source/core/math/MathUtilTests.cpp
+++ b/tests/unit-tests/Source/core/math/MathUtilTests.cpp
@@ -25,25 +25,29 @@
 
 #include <doctest.h>
 #include "base/Config.h"
+#include "base/Types.h"
+#include "TestUtils.h"
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
     #if defined(__arm64__)
-        #define USE_NEON64
-        #define INCLUDE_NEON64
+        #define USE_NEON64 1
+        #define INCLUDE_NEON64 1
     #elif defined(__ARM_NEON__)
-        #define USE_NEON32
-        #define INCLUDE_NEON32
+        #define USE_NEON32 1
+        #define INCLUDE_NEON32 1
+    #endif
+#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
+    #if defined(__arm64__) || defined(__aarch64__)
+        #define USE_NEON64 1
+        #define INCLUDE_NEON64 1
     #endif
 #elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
     #if defined(__arm64__) || defined(__aarch64__)
-        #define USE_NEON64
-        #define INCLUDE_NEON64
+        #define USE_NEON64 1
+        #define INCLUDE_NEON64 1
     #elif defined(__ARM_NEON__)
-        #define INCLUDE_NEON32
+        #define INCLUDE_NEON32 1
     #endif
-#elif defined(AX_USE_SSE)
-    #define USE_SSE
-    #define INCLUDE_SSE
 #endif
 
 #if defined(USE_NEON32) || defined(USE_NEON64) // || defined(USE_SSE)
@@ -84,6 +88,96 @@ static void __checkMathUtilResult(std::string_view description, const float* a1,
 }
 
 
+TEST_SUITE("math/MathUtil") {
+    using namespace UnitTest::ax;
+
+    // Compares the first 'count' vertices of v1 and v2 member by member (vertices, colors, texCoords) using CHECK_EQ.
+    static void checkVerticesAreEqual(const V3F_C4B_T2F* v1, const V3F_C4B_T2F* v2, size_t count)
+    {
+        for (size_t i = 0; i < count; ++i)
+        {
+            CHECK_EQ(v1[i].vertices, v2[i].vertices);
+            CHECK_EQ(v1[i].colors, v2[i].colors);
+            CHECK_EQ(v1[i].texCoords, v2[i].texCoords);
+        }
+    }
+
+
+    TEST_CASE("transformVertices") {
+        auto count = 5;
+        std::vector<V3F_C4B_T2F> src(count);
+        std::vector<V3F_C4B_T2F> expected(count);
+        std::vector<V3F_C4B_T2F> dst(count);
+        // Give every vertex distinct position, color and texCoord values derived from its index
+        for (int i = 0; i < count; ++i) {
+            src[i].vertices.set(float(i), float(i + 1), float(i + 2));
+            src[i].colors.set(uint8_t(i + 3), uint8_t(i + 4), uint8_t(i + 5), uint8_t(i + 6));
+            src[i].texCoords.set(float(i + 7), float(i + 8));
+            // Expected output: colors and texCoords identical to src, only the position is remapped
+            expected[i] = src[i];
+            expected[i].vertices.x = src[i].vertices.y * 4;
+            expected[i].vertices.y = src[i].vertices.x * -5;
+            expected[i].vertices.z = src[i].vertices.z * 6;
+        }
+        // Matrix that goes with the expected values above: x' = 4 * y, y' = -5 * x, z' = 6 * z
+        Mat4 transform(
+            0, 4, 0, 0,
+            -5, 0, 0, 0,
+            0, 0, 6, 0,
+            1, 2, 3, 1
+        );
+
+        SUBCASE("MathUtilC") {
+            MathUtilC::transformVertices(dst.data(), src.data(), count, transform);
+            checkVerticesAreEqual(expected.data(), dst.data(), count);
+        }
+
+        #if INCLUDE_NEON32
+            SUBCASE("MathUtilNeon") {
+                MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform);
+                checkVerticesAreEqual(expected.data(), dst.data(), count);
+            }
+        #endif
+
+        #if INCLUDE_NEON64
+            SUBCASE("MathUtilNeon64") {
+                MathUtilNeon64::transformVertices(dst.data(), src.data(), count, transform);
+                checkVerticesAreEqual(expected.data(), dst.data(), count);
+            }
+        #endif
+    }
+
+    TEST_CASE("transformIndices") {
+        auto count = 43; // 43 = 32 + 8 + 3: hits the 32-wide loop, the 8-wide loop and the scalar tail of the NEON64 version
+        std::vector<uint16_t> src(count);
+        std::vector<uint16_t> expected(count);
+        // src holds 0..count-1; expected holds the same values shifted by 5, the offset passed below
+        for (int i = 0; i < count; ++i) {
+            src[i] = i;
+            expected[i] = i + 5;
+        }
+
+        uint16_t offset = 5;
+
+        SUBCASE("MathUtilC") {
+            std::vector<uint16_t> dst(count);
+            MathUtilC::transformIndices(dst.data(), src.data(), count, offset);
+            for (int i = 0; i < count; ++i)
+                CHECK_EQ(expected[i], dst[i]);
+        }
+
+        #if INCLUDE_NEON64
+            SUBCASE("MathUtilNeon64") {
+                std::vector<uint16_t> dst(count);
+                MathUtilNeon64::transformIndices(dst.data(), src.data(), count, offset);
+                for (int i = 0; i < count; ++i)
+                    CHECK_EQ(expected[i], dst[i]);
+            }
+        #endif
+    }
+}
+
+
 TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
     TEST_CASE("old_tests") {
         // I know the next line looks ugly, but it's a way to test MathUtil. :)
diff --git a/tests/unit-tests/proj.android/app/build.gradle b/tests/unit-tests/proj.android/app/build.gradle
index 97766c5f9b6b..9086c18ad1bd 100644
--- a/tests/unit-tests/proj.android/app/build.gradle
+++ b/tests/unit-tests/proj.android/app/build.gradle
@@ -70,6 +70,8 @@ android {
             proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
             if (project.hasProperty("KEY_STORE_FILE")) {
                 signingConfig signingConfigs.release
+            } else {
+                signingConfig signingConfigs.debug
             }
         }