From f70bd5383ace506298033b12877440aa03e541e9 Mon Sep 17 00:00:00 2001 From: wangyuan Date: Mon, 23 Dec 2024 15:11:15 +0800 Subject: [PATCH 1/5] [Feature](mlu-ops): add mluApplyAdamW. --- bangc_helper_dtype.h | 113 ++++++++++++++++++ bangc_kernels.h | 95 +++++++++++++++ independent_build.sh | 7 +- kernels/adam_w/adam_w_union1.mlu | 73 +++++++---- mlu_op.h | 6 +- samples/mlu-ops/abs_sample/build.sh | 2 +- samples/mlu-ops/build.sh | 2 +- samples/mlu-ops/fault_sample/build.sh | 2 +- samples/mlu-ops/poly_nms_sample/build.sh | 2 +- scripts/gen_symbol_visibility_map.py | 6 + test/mlu_op_gtest/CMakeLists.txt | 4 +- test/mlu_op_gtest/pb_gtest/include/executor.h | 3 + .../pb_gtest/src/zoo/adam_w/adam_w.cpp | 51 ++++---- .../pb_gtest/src/zoo/adam_w/adam_w.h | 1 + 14 files changed, 315 insertions(+), 52 deletions(-) create mode 100644 bangc_helper_dtype.h create mode 100644 bangc_kernels.h diff --git a/bangc_helper_dtype.h b/bangc_helper_dtype.h new file mode 100644 index 000000000..a171593ec --- /dev/null +++ b/bangc_helper_dtype.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/
+#pragma once
+
+/**
+ * Provides `BANG_WRAP_T(ptr_arg)` for .cc and `BANG_UNWRAP_T(ptr_arg)` for .mlu
+ * to bridge Eigen:: types and BANGC types
+ */
+
+#include <type_traits>
+
+struct bang_half_t;
+struct bang_bfloat16_t;
+
+namespace detail {
+/*
+ * `bang_wrap_data` and `bang_unwrap_data` could be the same thing,
+ * but should be used in different scopes
+ *
+ * handles 'const DType' and 'DType *',
+ * could be implemented by SFINAE or just specialization
+ */
+template <typename DType, template <typename> class Impl,
+          typename RawType = DType>
+struct bang_trans_impl_ {
+  static_assert(std::is_same_v<DType, RawType>);
+  typedef DType type;
+};
+
+template <typename DType, template <typename> class Impl, typename RawType>
+struct bang_trans_impl_<DType *, Impl, RawType> {
+  typedef typename Impl<DType>::type* type;
+};
+
+template <typename DType, template <typename> class Impl, typename RawType>
+struct bang_trans_impl_<const DType, Impl, RawType> {
+  typedef const typename Impl<DType>::type type;
+};
+
+}  // namespace detail
+
+#define BANG_TRANS_TYPE_FROM_TO(TOKEN, From, To) \
+  template <>                                    \
+  struct TOKEN<From> {                           \
+    typedef To type;                             \
+  }
+
+/* For .cc/.cpp: translate an unknown type to the wrapped type */
+#if !defined(__BANG__)
+
+namespace Eigen {
+struct half;
+struct bfloat16;
+}  // namespace Eigen
+
+template <typename DType>
+struct bang_wrap_data {
+  using type = typename detail::bang_trans_impl_<DType, bang_wrap_data>::type;
+};
+
+#define BANG_WRAP_TYPE_FROM_TO(From, To) \
+  BANG_TRANS_TYPE_FROM_TO(bang_wrap_data, From, To)
+
+BANG_WRAP_TYPE_FROM_TO(Eigen::half, bang_half_t);
+BANG_WRAP_TYPE_FROM_TO(Eigen::bfloat16, bang_bfloat16_t);
+
+template <typename DType>
+using bang_wrap_data_t = typename bang_wrap_data<DType>::type;
+
+#define BANG_WRAP_T(a) reinterpret_cast<bang_wrap_data_t<decltype(a)>>(a)
+
+#endif  // !defined(__BANG__)
+
+/* For .mlu: translate the intermediate type to the MLU's underlying type */
+
+#if __BANG__
+template <typename DType>
+struct bang_unwrap_data {
+  using type = typename detail::bang_trans_impl_<DType, bang_unwrap_data>::type;
+};
+
+#define BANG_UNWRAP_TYPE_FROM_TO(From, To) \
+  BANG_TRANS_TYPE_FROM_TO(bang_unwrap_data, From, To)
+
+BANG_UNWRAP_TYPE_FROM_TO(bang_half_t, half);
+BANG_UNWRAP_TYPE_FROM_TO(bang_bfloat16_t, bfloat16_t);
+
+template <typename DType>
+using bang_unwrap_data_t = typename bang_unwrap_data<DType>::type;
+
+#define BANG_UNWRAP_T(a) reinterpret_cast<bang_unwrap_data_t<decltype(a)>>(a)
+
+#endif  // __BANG__
diff --git a/bangc_kernels.h b/bangc_kernels.h
new file mode 100644
index 000000000..5ffbcde31
--- /dev/null
+++ b/bangc_kernels.h
@@ -0,0 +1,95 @@
+/*************************************************************************
+ * Copyright (C) [2024] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef BANGC_KERNELS_H_
+#define BANGC_KERNELS_H_
+
+#ifndef NAMESPACE_BANGC_KERNELS_GEGIN
+#define NAMESPACE_BANGC_KERNELS_GEGIN namespace bangc_kernels {
+#endif
+
+NAMESPACE_BANGC_KERNELS_GEGIN
+
+#ifndef BANGC_KERNELS_WIN_API
+#ifdef _WIN32
+#define BANGC_KERNELS_WIN_API __stdcall
+#else
+#define BANGC_KERNELS_WIN_API
+#endif
+#endif
+
+typedef enum {
+  BANGC_KERNELS_STATUS_SUCCESS =
+      0, /*!< The operation is successfully completed. */
+  BANGC_KERNELS_STATUS_ALLOC_FAILED = 1,
+  /*!< This error occurs when the resource allocation fails, which is usually
+     caused by failing to call cnMallocHost due to exceeded memory usage. Make
+     sure that the memory allocated previously is deallocated as much as
+     possible. */
+  BANGC_KERNELS_STATUS_BAD_PARAM = 2,
+  /*!< Invalid value or parameters are passed to the function, including data
+     type, layout, dimensions, etc. */
+  BANGC_KERNELS_STATUS_INTERNAL_ERROR = 3,
+  /*!< An error occurs inside the function, which may indicate an internal
+     error or bug in the library. This error is usually caused by failing to
+     call cnrtMemcpyAsync. Check whether the memory passed to the function is
+     deallocated before the completion of the routine. */
+  BANGC_KERNELS_STATUS_ARCH_MISMATCH = 4,
+  /*!< Invalid MLU device which is not supported by current function. */
+  BANGC_KERNELS_STATUS_EXECUTION_FAILED = 5,
+  /*!< An error occurs when the function fails to be executed on MLU device due
+     to multiple reasons. You can check whether the hardware environment, driver
+     version and other prerequisite libraries are correctly installed. */
+  BANGC_KERNELS_STATUS_NOT_SUPPORTED = 6,
+  /*!< An error occurs when the requested functionality is not supported in this
+     version but may be supported in a future release. */
+  BANGC_KERNELS_STATUS_NUMERICAL_OVERFLOW = 7,
+  /*!< A numerical overflow occurs when executing the function, which is usually
+     due to large scale or inappropriate range of value of input tensor. */
+} bangcKernelsStatus_t;
+
+template <typename T>
+bangcKernelsStatus_t BANGC_KERNELS_WIN_API
+mluApplyAdamW(const cnrtQueue_t queue,
+              const float lr,
+              const float beta1,
+              const float beta2,
+              const float bias1,
+              const float bias2,
+              const float epsilon,
+              const float weight_decay,
+              const float scale,
+              const bool use_nesterov,
+              const size_t size,
+              T *param_h,
+              T *grad,
+              void *param,
+              void *momentum,
+              void *velocity);
+
+#ifndef NAMESPACE_BANGC_KERNELS_END
+#define NAMESPACE_BANGC_KERNELS_END }
+#endif
+
+NAMESPACE_BANGC_KERNELS_END
+
+#endif  // BANGC_KERNELS_H_
diff --git a/independent_build.sh b/independent_build.sh
index f52385ebb..9478d04a8 100755
--- a/independent_build.sh
+++ b/independent_build.sh
@@ -8,6 +8,7 @@ MLUOP_TARGET_CPU_ARCH=`uname -m`
 GEN_SYMBOL_VIS_FILE_PY="./scripts/gen_symbol_visibility_map.py"
 MLUOP_SYMBOL_VIS_FILE="symbol_visibility.map"
 TARGET_SYMBOL_FILE="mlu_op.h"
+TARGET_SYMBOL_FILE_LITE="bangc_kernels.h"
 PACKAGE_EXTRACT_DIR="dep_libs_extract"
 PROG_NAME=$(basename $0)   # current script filename, DO NOT EDIT
@@ -421,7 +422,7 @@ if [ "$OS_RELEASE_ID" = "centos" -a "$OS_RELEASE_VERSION_ID" = "7" ]; then
   fi
 fi
 
-if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" < "5" ]]; then
+if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" -lt "5" ]]; then
-f1)" -lt "5" ]]; then prog_log_note "we do not support g++<5, try to activate devtoolset-8 env" source /opt/rh/devtoolset-8/enable && prog_log_warn "devtoolset-8 activated" \ || ( prog_log_warn "source devtoolset-8 failed, ignore this info if you have set env TOOLCHAIN_ROOT, TARGET_C_COMPILER, TARGET_CXX_COMPILER properly (see more details in README.md)" && sleep 4 ) # I hope user will see it @@ -459,8 +460,8 @@ export PATH=${NEUWARE_HOME}/bin:$PATH export LD_LIBRARY_PATH=${NEUWARE_HOME}/lib64:$LD_LIBRARY_PATH prog_log_info "generate ${MLUOP_SYMBOL_VIS_FILE} file." -prog_log_info "python3 ${GEN_SYMBOL_VIS_FILE_PY} ${BUILD_PATH}/${MLUOP_SYMBOL_VIS_FILE} ${TARGET_SYMBOL_FILE}" -python3 ${GEN_SYMBOL_VIS_FILE_PY} ${BUILD_PATH}/${MLUOP_SYMBOL_VIS_FILE} ${TARGET_SYMBOL_FILE} +prog_log_info "python3 ${GEN_SYMBOL_VIS_FILE_PY} ${BUILD_PATH}/${MLUOP_SYMBOL_VIS_FILE} ${TARGET_SYMBOL_FILE} ${TARGET_SYMBOL_FILE_LITE}" +python3 ${GEN_SYMBOL_VIS_FILE_PY} ${BUILD_PATH}/${MLUOP_SYMBOL_VIS_FILE} ${TARGET_SYMBOL_FILE} ${TARGET_SYMBOL_FILE_LITE} pushd ${BUILD_PATH} > /dev/null prog_log_info "Rmove cmake cache ${PWD}" diff --git a/kernels/adam_w/adam_w_union1.mlu b/kernels/adam_w/adam_w_union1.mlu index db9cc672a..2536b33e5 100644 --- a/kernels/adam_w/adam_w_union1.mlu +++ b/kernels/adam_w/adam_w_union1.mlu @@ -21,12 +21,14 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ +#include "adam_w.h" + #include -#include -#include -#include "core/logging.h" -#include "kernels/adam_w/adam_w.h" -#include "kernels/utils/common.h" + +#include "bangc_helper_dtype.h" +#include "bangc_kernels.h" + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define SIZE_NRAM_PER_REGION PAD_DOWN((MAX_NRAM_SIZE / 12), NFU_ALIGN_SIZE) #define HIGH_PRECISION_MODE 1 @@ -193,7 +195,7 @@ __mlu_global__ void unionApplyAdamW(T *param_h, T *grad, float *param, ddr_param - 2 * num_x * param_flag, ddr_momentum - 2 * num_x, ddr_velocity - 2 * num_x, nbuf_paramh, nbuf_param, nbuf_momentum, nbuf_velocity, - std::min(num_x, (int)(num_task - (i - 2) * num_x)), + MIN(num_x, (int)(num_task - (i - 2) * num_x)), (i - 2) % 2 * pong); } // load data @@ -201,15 +203,14 @@ __mlu_global__ void unionApplyAdamW(T *param_h, T *grad, float *param, loadData(nbuf_paramh, (T *)(nbuf_grad + pong / 2), nbuf_param, nbuf_momentum, nbuf_velocity, ddr_paramh, ddr_grad, ddr_param, ddr_momentum, ddr_velocity, - std::min(num_x, (int)(num_task - i * num_x)), i % 2 * pong); + MIN(num_x, (int)(num_task - i * num_x)), i % 2 * pong); } // compute if (i >= 1 && i <= num_iter) { computeAdamW(nbuf_paramh, (T *)(nbuf_grad + pong / 2), nbuf_param, nbuf_grad, nbuf_momentum, nbuf_velocity, temp_1, temp_2, lr, beta1, beta2, bias1, bias2, epsilon, weight_decay, scale, - use_nesterov, - std::min(num_x, (int)(num_task - (i - 1) * num_x)), + use_nesterov, MIN(num_x, (int)(num_task - (i - 1) * num_x)), (i - 1) % 2 * pong, param_flag); } ddr_paramh += num_x * paramh_flag; @@ -228,16 +229,48 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW( void *momentum, void *velocity, float lr, float beta1, float beta2, float bias1, float bias2, float epsilon, float weight_decay, float scale, bool use_nesterov, size_t size, mluOpDataType_t k_data_type) { - switch (k_data_type) { - default: { - LOG(ERROR) << "Not Implemented."; - } - case MLUOP_DTYPE_BFLOAT16: { - KERNEL_CHECK(unionApplyAdamW<<>>( - (bfloat16_t *)param_h, (bfloat16_t *)grad, (float *)param, - (float *)momentum, (float *)velocity, lr, beta1, beta2, bias1, bias2, - 
-          epsilon, weight_decay, scale, use_nesterov, size));
-    }; break;
-  }
+  // launch kernel
+  unionApplyAdamW<<<k_dim, k_type, queue>>>(
+      (bfloat16_t *)param_h, (bfloat16_t *)grad, (float *)param,
+      (float *)momentum, (float *)velocity, lr, beta1, beta2, bias1, bias2,
+      epsilon, weight_decay, scale, use_nesterov, size);
   return MLUOP_STATUS_SUCCESS;
 }
+
+NAMESPACE_BANGC_KERNELS_GEGIN
+
+template <typename T>
+bangcKernelsStatus_t BANGC_KERNELS_WIN_API
+mluApplyAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
+              const float beta2, const float bias1, const float bias2,
+              const float epsilon, const float weight_decay, const float scale,
+              const bool use_nesterov, size_t size, T *param_h, T *grad,
+              void *param, void *momentum, void *velocity) {
+  // set job type
+  int ordinal = -1;
+  int cluster_num;
+  int core_dim;
+  cnrtGetDevice(&ordinal);
+  cnrtDeviceGetAttribute(&core_dim, cnrtAttrMcorePerCluster, ordinal);
+  cnrtDeviceGetAttribute(&cluster_num, cnrtAttrMaxClusterPerUnionLimitTask,
+                         ordinal);
+  cnrtFunctionType_t k_type = cnrtFuncTypeUnion1;
+  cnrtDim3_t k_dim{.x = (uint32_t)core_dim, .y = (uint32_t)cluster_num, .z = 1};
+
+  // launch kernel
+  unionApplyAdamW<<<k_dim, k_type, queue>>>(
+      BANG_UNWRAP_T(param_h), BANG_UNWRAP_T(grad), (float *)param,
+      (float *)momentum, (float *)velocity, lr, beta1, beta2, bias1, bias2,
+      epsilon, weight_decay, scale, use_nesterov, size);
+  return BANGC_KERNELS_STATUS_SUCCESS;
+}
+
+#define IMPL_MLU_APPLY_ADAMW_KERNEL(DType)                                    \
+  template bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluApplyAdamW(          \
+      const cnrtQueue_t, const float, const float, const float, const float,  \
+      const float, const float, const float, const float, const bool,         \
+      const size_t, DType *, DType *, void *, void *, void *)
+
+IMPL_MLU_APPLY_ADAMW_KERNEL(bang_bfloat16_t);
+
+NAMESPACE_BANGC_KERNELS_END
diff --git a/mlu_op.h b/mlu_op.h
index cd3a1735b..4feda384d 100644
--- a/mlu_op.h
+++ b/mlu_op.h
@@ -20,8 +20,8 @@
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *************************************************************************/
-#ifndef MLUOP_EXAMPLE_H_
-#define MLUOP_EXAMPLE_H_
+#ifndef MLUOP_H_
+#define MLUOP_H_
 
 /******************************************************************************
  * MLU-OPS: Cambricon Open Source operator library for Network
@@ -14526,4 +14526,4 @@ mluOpLgamma(mluOpHandle_t handle,
 }
 #endif
 
-#endif  // MLUOP_EXAMPLE_H_
+#endif  // MLUOP_H_
diff --git a/samples/mlu-ops/abs_sample/build.sh b/samples/mlu-ops/abs_sample/build.sh
index 459b22600..d6f6fec67 100755
--- a/samples/mlu-ops/abs_sample/build.sh
+++ b/samples/mlu-ops/abs_sample/build.sh
@@ -9,7 +9,7 @@ if [ "$OS_RELEASE_ID" = "centos" -a "$OS_RELEASE_VERSION_ID" = "7" ]; then
   fi
 fi
 
-if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" < "5" ]]; then
+if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' 
-f1)" -lt "5" ]]; then echo "we do not support g++<5, try to activate devtoolset-7 env" source /opt/rh/devtoolset-7/enable && echo "devtoolset-7 activated" \ || ( echo "source devtoolset-7 failed, ignore this info if you have set env TOOLCHAIN_ROOT, TARGET_C_COMPILER, TARGET_CXX_COMPILER properly (see more details in README.md)" && sleep 4 ) # I hope user will see it diff --git a/samples/mlu-ops/build.sh b/samples/mlu-ops/build.sh index 8b0987113..0189ec7d8 100755 --- a/samples/mlu-ops/build.sh +++ b/samples/mlu-ops/build.sh @@ -9,7 +9,7 @@ if [ "$OS_RELEASE_ID" = "centos" -a "$OS_RELEASE_VERSION_ID" = "7" ]; then fi fi -if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" < "5" ]]; then +if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" -lt "5" ]]; then echo "we do not support g++<5, try to activate devtoolset-8 env" source /opt/rh/devtoolset-8/enable && echo "devtoolset-8 activated" \ || ( echo "source devtoolset-8 failed, ignore this info if you have set env TOOLCHAIN_ROOT, TARGET_C_COMPILER, TARGET_CXX_COMPILER properly (see more details in README.md)" && sleep 4 ) # I hope user will see it diff --git a/samples/mlu-ops/fault_sample/build.sh b/samples/mlu-ops/fault_sample/build.sh index 787249233..95e635c8e 100755 --- a/samples/mlu-ops/fault_sample/build.sh +++ b/samples/mlu-ops/fault_sample/build.sh @@ -9,7 +9,7 @@ if [ "$OS_RELEASE_ID" = "centos" -a "$OS_RELEASE_VERSION_ID" = "7" ]; then fi fi -if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" < "5" ]]; then +if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" -lt "5" ]]; then echo "we do not support g++<5, try to activate devtoolset-7 env" source /opt/rh/devtoolset-7/enable && echo "devtoolset-7 activated" \ || ( echo "source devtoolset-7 failed, ignore this info if you have set env TOOLCHAIN_ROOT, TARGET_C_COMPILER, TARGET_CXX_COMPILER properly (see more details in README.md)" && sleep 4 ) # I hope user will see it diff --git a/samples/mlu-ops/poly_nms_sample/build.sh b/samples/mlu-ops/poly_nms_sample/build.sh index 787249233..95e635c8e 100755 --- a/samples/mlu-ops/poly_nms_sample/build.sh +++ b/samples/mlu-ops/poly_nms_sample/build.sh @@ -9,7 +9,7 @@ if [ "$OS_RELEASE_ID" = "centos" -a "$OS_RELEASE_VERSION_ID" = "7" ]; then fi fi -if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' -f1)" < "5" ]]; then +if [[ "$(g++ --version | head -n1 | awk '{ print $3 }' | cut -d '.' 
-f1)" -lt "5" ]]; then echo "we do not support g++<5, try to activate devtoolset-7 env" source /opt/rh/devtoolset-7/enable && echo "devtoolset-7 activated" \ || ( echo "source devtoolset-7 failed, ignore this info if you have set env TOOLCHAIN_ROOT, TARGET_C_COMPILER, TARGET_CXX_COMPILER properly (see more details in README.md)" && sleep 4 ) # I hope user will see it diff --git a/scripts/gen_symbol_visibility_map.py b/scripts/gen_symbol_visibility_map.py index 22eb24d98..8e26114f4 100644 --- a/scripts/gen_symbol_visibility_map.py +++ b/scripts/gen_symbol_visibility_map.py @@ -6,12 +6,18 @@ def get_mluops(input_file): ops_str="" pattern = re.compile(r'(?PmluOp\w+) *\(') + pattern_lite = re.compile(r'(?PmluApply\w+) *\(') with open(input_file,'r', encoding='utf8') as f: for line in f: match = pattern.search(line) + lite_match = pattern_lite.search(line) if match: op = match.groupdict()['api'] + ';' ops_str += op + + if lite_match: + op = lite_match.groupdict()['api'] + '*;' + ops_str += '*' + op return ops_str def create_map_file(map_file,ops_str): diff --git a/test/mlu_op_gtest/CMakeLists.txt b/test/mlu_op_gtest/CMakeLists.txt index 1ffb3c3d3..225f625be 100644 --- a/test/mlu_op_gtest/CMakeLists.txt +++ b/test/mlu_op_gtest/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.5) cmake_policy(SET CMP0048 NEW) # Use project(... VERSION ...) project(mlu_op_test) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # check @@ -155,7 +155,7 @@ add_library(gtest_shared STATIC ${SRC_DIR}) # for runtime convenience # #target_link_libraries(gen_half2float_table cnrt) add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/gen_half2float_table - COMMAND ${BANG_CNCC_EXECUTABLE} -mavx2 -mf16c -std=c++14 -I ${CMAKE_CURRENT_SOURCE_DIR}/include -I ${NEUWARE_HOME}/include ${CMAKE_CURRENT_SOURCE_DIR}/tools/gen_half2float_table.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/gen_half2float_table -L ${NEUWARE_HOME}/lib64 -lcnrt -lm -lstdc++ -Wl,-rpath=${NEUWARE_HOME}/lib64 + COMMAND ${BANG_CNCC_EXECUTABLE} -mavx2 -mf16c -std=c++17 -I ${CMAKE_CURRENT_SOURCE_DIR}/include -I ${NEUWARE_HOME}/include ${CMAKE_CURRENT_SOURCE_DIR}/tools/gen_half2float_table.cpp -o ${CMAKE_CURRENT_BINARY_DIR}/gen_half2float_table -L ${NEUWARE_HOME}/lib64 -lcnrt -lm -lstdc++ -Wl,-rpath=${NEUWARE_HOME}/lib64 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/tools/gen_half2float_table.cpp ${CMAKE_CURRENT_SOURCE_DIR}/include/math_half.h ) add_custom_command( diff --git a/test/mlu_op_gtest/pb_gtest/include/executor.h b/test/mlu_op_gtest/pb_gtest/include/executor.h index 9adb5416c..12edb8451 100644 --- a/test/mlu_op_gtest/pb_gtest/include/executor.h +++ b/test/mlu_op_gtest/pb_gtest/include/executor.h @@ -39,6 +39,8 @@ #include #include "gtest/gtest.h" #include "mlu_op.h" +#include "bangc_helper_dtype.h" +#include "bangc_kernels.h" #include "core/tensor.h" #include "core/tool.h" #include "core/type.h" @@ -138,6 +140,7 @@ struct ExecuteConfig { bool random_mlu_address = false; bool enable_const_dram = false; bool auto_tuning = false; + bool enable_lite_interface = getEnv("MLUOP_GTEST_INTERFACE_MODE", 0) == 1; // #if GTEST_ENABLE_GPERFTOOLS // // TODO(None) move into global_var // bool gtest_internal_cpu_profile = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp index b0f6f229f..8594c6f1d 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp @@ -38,7 +38,6 @@ void 
AdamWExecutor::paramCheck() { } void AdamWExecutor::compute() { - VLOG(4) << "AdamWExecutor compute "; auto desc_param = tensor_desc_[0].tensor; auto desc_paramh = tensor_desc_[1].tensor; auto desc_momentum = tensor_desc_[2].tensor; @@ -62,25 +61,37 @@ void AdamWExecutor::compute() { const float fp32_scale = parser_->getProtoNode()->adamw_param().scale(); bool use_nesterov = parser_->getProtoNode()->adamw_param().use_nesterov(); - mluOpAdamWDescriptor_t adamw_desc; - MLUOP_CHECK(mluOpCreateAdamWDescriptor(&adamw_desc)); - MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_WEIGHT_DECAY, - &fp32_weight_decay, - sizeof(fp32_weight_decay))); - MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_GRAD_SCALE, - &fp32_scale, sizeof(fp32_scale))); - MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_USE_NESTEROV, - &use_nesterov, sizeof(use_nesterov))); - - VLOG(4) << "call mluOpAdamw()"; - interface_timer_.start(); - MLUOP_CHECK(mluOpAdamW(handle_, adamw_desc, desc_param, dev_param, - desc_paramh, dev_paramh, desc_momentum, dev_momentum, - desc_velocity, dev_velocity, desc_grad, dev_grad, - fp32_lr, fp32_beta1, fp32_beta2, fp32_bias1, - fp32_bias2, fp32_epsilon)); - interface_timer_.stop(); - MLUOP_CHECK(mluOpDestroyAdamWDescriptor(adamw_desc)); + if (!exe_config_->enable_lite_interface) { + VLOG(4) << "call mluOpAdamw. "; + mluOpAdamWDescriptor_t adamw_desc; + MLUOP_CHECK(mluOpCreateAdamWDescriptor(&adamw_desc)); + MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_WEIGHT_DECAY, + &fp32_weight_decay, + sizeof(fp32_weight_decay))); + MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_GRAD_SCALE, + &fp32_scale, sizeof(fp32_scale))); + MLUOP_CHECK(mluOpSetAdamWDescAttr(adamw_desc, MLUOP_ADAMW_USE_NESTEROV, + &use_nesterov, sizeof(use_nesterov))); + interface_timer_.start(); + MLUOP_CHECK(mluOpAdamW(handle_, adamw_desc, desc_param, dev_param, + desc_paramh, dev_paramh, desc_momentum, dev_momentum, + desc_velocity, dev_velocity, desc_grad, dev_grad, + fp32_lr, fp32_beta1, fp32_beta2, fp32_bias1, + fp32_bias2, fp32_epsilon)); + interface_timer_.stop(); + MLUOP_CHECK(mluOpDestroyAdamWDescriptor(adamw_desc)); + } else { + VLOG(4) << "call mluApplyAdamW. "; + const int size = mluOpGetTensorElementNum(desc_momentum) * sizeof(float); + interface_timer_.start(); + const auto adamw_status = bangc_kernels::mluApplyAdamW( + handle_->queue, fp32_lr, fp32_beta1, fp32_beta2, fp32_bias1, fp32_bias2, + fp32_epsilon, fp32_weight_decay, fp32_scale, use_nesterov, size, + BANG_WRAP_T((Eigen::bfloat16 *)dev_paramh), + BANG_WRAP_T((Eigen::bfloat16 *)dev_grad), dev_param, dev_momentum, + dev_velocity); + interface_timer_.stop(); + } } void AdamWExecutor::setMiscellaneousParam() { diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.h b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.h index 79cb82bf9..66388bbfb 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.h +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.h @@ -22,6 +22,7 @@ *************************************************************************/ #ifndef TEST_MLU_OP_GTEST_SRC_ZOO_ADAMW_ADAMW_H_ #define TEST_MLU_OP_GTEST_SRC_ZOO_ADAMW_ADAMW_H_ + #include "executor.h" namespace mluoptest { From c95249199ff57f4b38a70b250c6761206add592c Mon Sep 17 00:00:00 2001 From: wangyuan Date: Mon, 23 Dec 2024 16:18:51 +0800 Subject: [PATCH 2/5] [Feature](mlu-ops): add mluAdamW. 
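
Rename the exported lite-interface entry point from mluApplyAdamW to
mluAdamW, and widen the symbol-visibility pattern so the renamed symbol
is still exported. A minimal call-site sketch after the rename (it
mirrors the gtest change below; the queue, the scalar hyperparameters,
and the device buffers are assumed to be prepared by the caller):

  // dev_paramh/dev_grad hold bfloat16 data; the remaining buffers are fp32.
  const auto status = bangc_kernels::mluAdamW(
      queue, lr, beta1, beta2, bias1, bias2, epsilon,
      weight_decay, scale, use_nesterov, size,
      BANG_WRAP_T((Eigen::bfloat16 *)dev_paramh),
      BANG_WRAP_T((Eigen::bfloat16 *)dev_grad),
      dev_param, dev_momentum, dev_velocity);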
---
 bangc_kernels.h                               | 33 +++++++++----------
 kernels/adam_w/adam_w.cpp                     | 11 +++----
 kernels/adam_w/adam_w.h                       |  2 +-
 kernels/adam_w/adam_w_union1.mlu              | 14 ++++----
 scripts/gen_symbol_visibility_map.py          |  2 +-
 .../pb_gtest/src/zoo/adam_w/adam_w.cpp        |  4 +--
 6 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/bangc_kernels.h b/bangc_kernels.h
index 5ffbcde31..dea403f82 100644
--- a/bangc_kernels.h
+++ b/bangc_kernels.h
@@ -68,23 +68,22 @@ typedef enum {
 } bangcKernelsStatus_t;
 
 template <typename T>
-bangcKernelsStatus_t BANGC_KERNELS_WIN_API
-mluApplyAdamW(const cnrtQueue_t queue,
-              const float lr,
-              const float beta1,
-              const float beta2,
-              const float bias1,
-              const float bias2,
-              const float epsilon,
-              const float weight_decay,
-              const float scale,
-              const bool use_nesterov,
-              const size_t size,
-              T *param_h,
-              T *grad,
-              void *param,
-              void *momentum,
-              void *velocity);
+bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluAdamW(const cnrtQueue_t queue,
+                                                    const float lr,
+                                                    const float beta1,
+                                                    const float beta2,
+                                                    const float bias1,
+                                                    const float bias2,
+                                                    const float epsilon,
+                                                    const float weight_decay,
+                                                    const float scale,
+                                                    const bool use_nesterov,
+                                                    const size_t size,
+                                                    T *param_h,
+                                                    T *grad,
+                                                    void *param,
+                                                    void *momentum,
+                                                    void *velocity);
 
 #ifndef NAMESPACE_BANGC_KERNELS_END
 #define NAMESPACE_BANGC_KERNELS_END }
diff --git a/kernels/adam_w/adam_w.cpp b/kernels/adam_w/adam_w.cpp
index d996fe3b3..ca9d88aba 100644
--- a/kernels/adam_w/adam_w.cpp
+++ b/kernels/adam_w/adam_w.cpp
@@ -267,12 +267,11 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc,
                 << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
       CHECK_RETURN(
           "[mluOpAdamW]",
-          KernelApplyAdamW(k_dim, k_type, handle->queue, (void *)param,
-                           (void *)param_h, (void *)grad, (void *)momentum,
-                           (void *)velocity, lr, beta1, beta2, bias1, bias2,
-                           epsilon, adamw_desc->weight_decay,
-                           adamw_desc->grad_scale, adamw_desc->use_nesterov,
-                           size, k_data_type));
+          KernelApplyAdamW(
+              k_dim, k_type, handle->queue, (void *)param, (void *)param_h,
+              (void *)grad, (void *)momentum, (void *)velocity, lr, beta1,
+              beta2, bias1, bias2, epsilon, adamw_desc->weight_decay,
+              adamw_desc->grad_scale, adamw_desc->use_nesterov, size));
     }
   }
   GEN_CASE_END();
diff --git a/kernels/adam_w/adam_w.h b/kernels/adam_w/adam_w.h
index 567aac49f..15aa74821 100644
--- a/kernels/adam_w/adam_w.h
+++ b/kernels/adam_w/adam_w.h
@@ -38,6 +38,6 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW(
     const cnrtQueue_t queue, void *param, void *param_h, void *grad,
     void *momentum, void *velocity, float lr, float beta1, float beta2,
     float bias1, float bias2, float epsilon, float weight_decay, float scale,
-    bool use_nesterov, size_t size, mluOpDataType_t k_data_type);
+    bool use_nesterov, size_t size);
 
 #endif  // KERNELS_ADAMW_ADAMW_H_
diff --git a/kernels/adam_w/adam_w_union1.mlu b/kernels/adam_w/adam_w_union1.mlu
index 2536b33e5..da031c7c3 100644
--- a/kernels/adam_w/adam_w_union1.mlu
+++ b/kernels/adam_w/adam_w_union1.mlu
@@ -228,7 +228,7 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW(
     const cnrtQueue_t queue, void *param, void *param_h, void *grad,
     void *momentum, void *velocity, float lr, float beta1, float beta2,
     float bias1, float bias2, float epsilon, float weight_decay, float scale,
-    bool use_nesterov, size_t size, mluOpDataType_t k_data_type) {
+    bool use_nesterov, size_t size) {
   // launch kernel
   unionApplyAdamW<<<k_dim, k_type, queue>>>(
       (bfloat16_t *)param_h, (bfloat16_t *)grad, (float *)param,
       (float *)momentum, (float *)velocity, lr, beta1, beta2, bias1, bias2,
       epsilon, weight_decay, scale, use_nesterov, size);
   return MLUOP_STATUS_SUCCESS;
 }
 
@@ -241,11 +241,11 @@ NAMESPACE_BANGC_KERNELS_GEGIN
 template <typename T>
 bangcKernelsStatus_t BANGC_KERNELS_WIN_API
-mluApplyAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
-              const float beta2, const float bias1, const float bias2,
-              const float epsilon, const float weight_decay, const float scale,
-              const bool use_nesterov, size_t size, T *param_h, T *grad,
-              void *param, void *momentum, void *velocity) {
+mluAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
+         const float beta2, const float bias1, const float bias2,
+         const float epsilon, const float weight_decay, const float scale,
+         const bool use_nesterov, size_t size, T *param_h, T *grad, void *param,
+         void *momentum, void *velocity) {
   // set job type
   int ordinal = -1;
   int cluster_num;
@@ -266,7 +266,7 @@ mluApplyAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
 }
 
 #define IMPL_MLU_APPLY_ADAMW_KERNEL(DType)                                    \
-  template bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluApplyAdamW(          \
+  template bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluAdamW(               \
       const cnrtQueue_t, const float, const float, const float, const float,  \
       const float, const float, const float, const float, const bool,         \
       const size_t, DType *, DType *, void *, void *, void *)
diff --git a/scripts/gen_symbol_visibility_map.py b/scripts/gen_symbol_visibility_map.py
index 8e26114f4..39abe86c6 100644
--- a/scripts/gen_symbol_visibility_map.py
+++ b/scripts/gen_symbol_visibility_map.py
@@ -6,7 +6,7 @@ def get_mluops(input_file):
     ops_str=""
     pattern = re.compile(r'(?P<api>mluOp\w+) *\(')
-    pattern_lite = re.compile(r'(?P<api>mluApply\w+) *\(')
+    pattern_lite = re.compile(r'(?P<api>mlu\w+) *\(')
     with open(input_file,'r', encoding='utf8') as f:
         for line in f:
             match = pattern.search(line)
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
index 8594c6f1d..906ea22f2 100644
--- a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
@@ -81,10 +81,10 @@ void AdamWExecutor::compute() {
     interface_timer_.stop();
     MLUOP_CHECK(mluOpDestroyAdamWDescriptor(adamw_desc));
   } else {
-    VLOG(4) << "call mluApplyAdamW. ";
+    VLOG(4) << "call mluAdamW. ";
     const int size = mluOpGetTensorElementNum(desc_momentum) * sizeof(float);
     interface_timer_.start();
-    const auto adamw_status = bangc_kernels::mluApplyAdamW(
+    const auto adamw_status = bangc_kernels::mluAdamW(
       handle_->queue, fp32_lr, fp32_beta1, fp32_beta2, fp32_bias1, fp32_bias2,
       fp32_epsilon, fp32_weight_decay, fp32_scale, use_nesterov, size,
       BANG_WRAP_T((Eigen::bfloat16 *)dev_paramh),
       BANG_WRAP_T((Eigen::bfloat16 *)dev_grad), dev_param, dev_momentum,
       dev_velocity);

From a909ce7622ec1dd82c1f920510d1707385c944af Mon Sep 17 00:00:00 2001
From: wangyuan
Date: Mon, 23 Dec 2024 16:59:05 +0800
Subject: [PATCH 3/5] [Feature](mlu-ops): add mluAdamW.
---
 bangc_kernels.h                  | 6 +++---
 kernels/adam_w/adam_w_union1.mlu | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bangc_kernels.h b/bangc_kernels.h
index dea403f82..049d0faff 100644
--- a/bangc_kernels.h
+++ b/bangc_kernels.h
@@ -23,11 +23,11 @@
 #ifndef BANGC_KERNELS_H_
 #define BANGC_KERNELS_H_
 
-#ifndef NAMESPACE_BANGC_KERNELS_GEGIN
-#define NAMESPACE_BANGC_KERNELS_GEGIN namespace bangc_kernels {
+#ifndef NAMESPACE_BANGC_KERNELS_BEGIN
+#define NAMESPACE_BANGC_KERNELS_BEGIN namespace bangc_kernels {
 #endif
 
-NAMESPACE_BANGC_KERNELS_GEGIN
+NAMESPACE_BANGC_KERNELS_BEGIN
 
 #ifndef BANGC_KERNELS_WIN_API
 #ifdef _WIN32
diff --git a/kernels/adam_w/adam_w_union1.mlu b/kernels/adam_w/adam_w_union1.mlu
index da031c7c3..e306a1805 100644
--- a/kernels/adam_w/adam_w_union1.mlu
+++ b/kernels/adam_w/adam_w_union1.mlu
@@ -237,7 +237,7 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW(
   return MLUOP_STATUS_SUCCESS;
 }
 
-NAMESPACE_BANGC_KERNELS_GEGIN
+NAMESPACE_BANGC_KERNELS_BEGIN
 
 template <typename T>
 bangcKernelsStatus_t BANGC_KERNELS_WIN_API

From 50bebcca89831db8f8b341675e5659b3b0524176 Mon Sep 17 00:00:00 2001
From: wangyuan
Date: Mon, 23 Dec 2024 17:05:12 +0800
Subject: [PATCH 4/5] [Feature](mlu-ops): add mluAdamW.

---
 bangc_kernels.h                                      | 6 +++---
 test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bangc_kernels.h b/bangc_kernels.h
index 049d0faff..3ea8f70de 100644
--- a/bangc_kernels.h
+++ b/bangc_kernels.h
@@ -81,9 +81,9 @@ bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluAdamW(const cnrtQueue_t queue,
                                                     const size_t size,
                                                     T *param_h,
                                                     T *grad,
-                                                    void *param,
-                                                    void *momentum,
-                                                    void *velocity);
+                                                    float *param,
+                                                    float *momentum,
+                                                    float *velocity);
 
 #ifndef NAMESPACE_BANGC_KERNELS_END
 #define NAMESPACE_BANGC_KERNELS_END }
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
index 906ea22f2..073216c67 100644
--- a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp
@@ -88,8 +88,8 @@ void AdamWExecutor::compute() {
       handle_->queue, fp32_lr, fp32_beta1, fp32_beta2, fp32_bias1, fp32_bias2,
       fp32_epsilon, fp32_weight_decay, fp32_scale, use_nesterov, size,
       BANG_WRAP_T((Eigen::bfloat16 *)dev_paramh),
-      BANG_WRAP_T((Eigen::bfloat16 *)dev_grad), dev_param, dev_momentum,
-      dev_velocity);
+      BANG_WRAP_T((Eigen::bfloat16 *)dev_grad), (float *)dev_param,
+      (float *)dev_momentum, (float *)dev_velocity);
     interface_timer_.stop();
   }
 }

From 9154a65dbee41ae70802f6c218f4166a74928850 Mon Sep 17 00:00:00 2001
From: wangyuan
Date: Mon, 23 Dec 2024 17:37:33 +0800
Subject: [PATCH 5/5] [Feature](mlu-ops): add mluAdamW.

---
 bangc_kernels.h                  | 72 ++++++++++++++++++++++++++++++++
 kernels/adam_w/adam_w_union1.mlu | 12 +++---
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/bangc_kernels.h b/bangc_kernels.h
index 3ea8f70de..7ffdf84bf 100644
--- a/bangc_kernels.h
+++ b/bangc_kernels.h
@@ -67,6 +67,78 @@ typedef enum {
      due to large scale or inappropriate range of value of input tensor. */
 } bangcKernelsStatus_t;
 
+
+// Group: AdamW
+/*!
+ * @brief Updates each attribute by using AdamW.
+ *
+ * @param[in] queue
+ * A pointer to the cnrtQueue struct holding the information about a queue.
+ * @param[in] lr
+ * A hyperparameter representing the learning rate.
+ * @param[in] beta1
+ * A hyperparameter for updating momentum.
+ * @param[in] beta2
+ * A hyperparameter for updating velocity.
+ * @param[in] bias1
+ * A bias correction term for momentum, used when updating param.
+ * @param[in] bias2
+ * A bias correction term for velocity, used when updating param.
+ * @param[in] epsilon
+ * A small value that prevents the denominator from being zero.
+ * @param[in] weight_decay
+ * A hyperparameter representing weight decay.
+ * @param[in] scale
+ * A scaling factor applied to shrink the gradient.
+ * @param[in] use_nesterov
+ * A parameter that determines whether to use the NAG algorithm.
+ * @param[in] size
+ * A parameter that represents the amount of data in the param tensor.
+ * @param[in] param_h
+ * Pointer to the MLU memory that stores the param_h tensor.
+ * @param[in] grad
+ * Pointer to the MLU memory that stores the grad tensor.
+ * @param[in] param
+ * Pointer to the MLU memory that stores the param tensor.
+ * @param[in] momentum
+ * Pointer to the MLU memory that stores the momentum tensor.
+ * @param[in] velocity
+ * Pointer to the MLU memory that stores the velocity tensor.
+ * @par Return
+ * - ::BANGC_KERNELS_STATUS_SUCCESS
+ *
+ * @par Data Type
+ * - The supported data types of input and output tensors are as follows:
+ *   - param_h tensor: bfloat16
+ *   - grad tensor: bfloat16
+ *   - param tensor: float
+ *   - momentum tensor: float
+ *   - velocity tensor: float
+ *
+ * @par Data Layout
+ * - The supported data layouts of \b param tensor, \b param_h tensor, \b momentum tensor, \b velocity tensor, and \b
+ *   grad tensor are as follows:
+ *   - param tensor: ARRAY
+ *   - param_h tensor: ARRAY
+ *   - momentum tensor: ARRAY
+ *   - velocity tensor: ARRAY
+ *   - grad tensor: ARRAY
+ *
+ * @par Scale Limitation
+ * - None.
+ *
+ * @par API Dependency
+ * - None.
+ *
+ * @par Note
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - https://github.com/OpenBMB/BMTrain/blob/6abcf772aa1e120192f7656e55c4adbcde53c886/csrc/cuda/adam_cuda.cu
+ */
 template <typename T>
 bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluAdamW(const cnrtQueue_t queue,
                                                     const float lr,
diff --git a/kernels/adam_w/adam_w_union1.mlu b/kernels/adam_w/adam_w_union1.mlu
index e306a1805..7d536ad3d 100644
--- a/kernels/adam_w/adam_w_union1.mlu
+++ b/kernels/adam_w/adam_w_union1.mlu
@@ -244,8 +244,8 @@ bangcKernelsStatus_t BANGC_KERNELS_WIN_API
 mluAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
          const float beta2, const float bias1, const float bias2,
          const float epsilon, const float weight_decay, const float scale,
-         const bool use_nesterov, size_t size, T *param_h, T *grad, void *param,
-         void *momentum, void *velocity) {
+         const bool use_nesterov, size_t size, T *param_h, T *grad,
+         float *param, float *momentum, float *velocity) {
   // set job type
   int ordinal = -1;
   int cluster_num;
@@ -259,9 +259,9 @@ mluAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
 
   // launch kernel
   unionApplyAdamW<<<k_dim, k_type, queue>>>(
-      BANG_UNWRAP_T(param_h), BANG_UNWRAP_T(grad), (float *)param,
-      (float *)momentum, (float *)velocity, lr, beta1, beta2, bias1, bias2,
-      epsilon, weight_decay, scale, use_nesterov, size);
+      BANG_UNWRAP_T(param_h), BANG_UNWRAP_T(grad), param, momentum, velocity,
+      lr, beta1, beta2, bias1, bias2, epsilon, weight_decay, scale,
+      use_nesterov, size);
   return BANGC_KERNELS_STATUS_SUCCESS;
 }
@@ -269,7 +269,7 @@ mluAdamW(const cnrtQueue_t queue, const float lr, const float beta1,
   template bangcKernelsStatus_t BANGC_KERNELS_WIN_API mluAdamW(             \
       const cnrtQueue_t, const float, const float, const float, const float, \
       const float, const float, const float, const float, const bool,       \
-      const size_t, DType *, DType *, void *, void *, void *)
+      const size_t, DType *, DType *, float *, float *, float *)
 
 IMPL_MLU_APPLY_ADAMW_KERNEL(bang_bfloat16_t);
 
 NAMESPACE_BANGC_KERNELS_END
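
Usage note: a minimal host-side sketch of the final lite interface. This is an
illustration, not part of the diffs above; it assumes a CNRT v6-style runtime
(cnrtQueueCreate/cnrtMalloc/cnrtQueueSync), the hyperparameter values and the
element count are placeholders, and only bang_bfloat16_t is instantiated by
IMPL_MLU_APPLY_ADAMW_KERNEL, so param_h/grad must be bfloat16 buffers.

  #include <cstdint>
  #include "cnrt.h"
  #include "bangc_helper_dtype.h"
  #include "bangc_kernels.h"

  int main() {
    const size_t n = 1 << 20;  // element count (placeholder value)
    cnrtQueue_t queue;
    cnrtQueueCreate(&queue);

    void *param_h, *grad, *param, *momentum, *velocity;
    cnrtMalloc(&param_h, n * sizeof(uint16_t));  // bfloat16 working weights
    cnrtMalloc(&grad, n * sizeof(uint16_t));     // bfloat16 gradients
    cnrtMalloc(&param, n * sizeof(float));       // fp32 master weights
    cnrtMalloc(&momentum, n * sizeof(float));    // fp32 first moment
    cnrtMalloc(&velocity, n * sizeof(float));    // fp32 second moment
    // ... fill the buffers from host data, e.g. with cnrtMemcpy ...

    // BANG_WRAP_T maps Eigen::bfloat16 * to the bang_bfloat16_t * that the
    // lite interface is explicitly instantiated for.
    bangc_kernels::mluAdamW(queue,
                            /*lr=*/1e-3f, /*beta1=*/0.9f, /*beta2=*/0.999f,
                            /*bias1=*/1.0f, /*bias2=*/1.0f,
                            /*epsilon=*/1e-8f, /*weight_decay=*/1e-2f,
                            /*scale=*/1.0f, /*use_nesterov=*/false, n,
                            BANG_WRAP_T((Eigen::bfloat16 *)param_h),
                            BANG_WRAP_T((Eigen::bfloat16 *)grad),
                            (float *)param, (float *)momentum,
                            (float *)velocity);

    cnrtQueueSync(queue);  // the kernel launch is asynchronous on `queue`
    cnrtQueueDestroy(queue);
    return 0;
  }

With MLUOP_GTEST_INTERFACE_MODE=1, the gtest executor takes this same path
through bangc_kernels::mluAdamW instead of the descriptor-based mluOpAdamW.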