diff --git a/bangc-ops/kernels/kernel_wrapper/lib/libextops.a b/bangc-ops/kernels/kernel_wrapper/lib/libextops.a index 4500375a5c..8498a04ebc 100644 Binary files a/bangc-ops/kernels/kernel_wrapper/lib/libextops.a and b/bangc-ops/kernels/kernel_wrapper/lib/libextops.a differ diff --git a/bangc-ops/kernels/kernel_wrapper/wrapper.h b/bangc-ops/kernels/kernel_wrapper/wrapper.h index 1d2790a568..7eb651fd84 100644 --- a/bangc-ops/kernels/kernel_wrapper/wrapper.h +++ b/bangc-ops/kernels/kernel_wrapper/wrapper.h @@ -178,6 +178,73 @@ const mluOpTensorDescriptor_t, \ void * +#define SYNCBATCHNORMSTATS_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, const float, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void * + +#define SYNCBATCHNORMSTATS_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, void *, size_t, \ + const float, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMGATHERSTATSWITHCOUNTS_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, float, float, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void * + +#define SYNCBATCHNORMELEMT_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMBACKWADREDUCE_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ 
+ const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void *, const bool, const bool, \ + const bool + +#define SYNCBATCHNORMBACKWADREDUCE_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, void *, size_t, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void *, const bool, const bool, \ + const bool + +#define SYNCBATCHNORMBACKWARDELEMT_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMBACKWARDELEMT_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, \ + const void *, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + /* Kernel register */ KERNEL_REGISTER(addN, ADDN_PARAM_TYPE); KERNEL_REGISTER(addNV2, ADDNV2_PARAM_TYPE); @@ -203,4 +270,17 @@ KERNEL_REGISTER(RoiAlignBackward, ROIALIGNBACKWARD_PARAM_TYPE); KERNEL_REGISTER(RoiAlignBackwardV2, ROIALIGNBACKWARD_V2_PARAM_TYPE); KERNEL_REGISTER(RoiPoolingForward, ROIPOOLINGFORWARD_PARAM_TYPE); 
KERNEL_REGISTER(RoiPoolingBackward, ROIPOOLINGBACKWARD_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormStats, SYNCBATCHNORMSTATS_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormStatsV2, SYNCBATCHNORMSTATS_V2_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormGatherStatsWithCounts, + SYNCBATCHNORMGATHERSTATSWITHCOUNTS_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormElemt, SYNCBATCHNORMELEMT_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchnormBackwardReduce, + SYNCBATCHNORMBACKWADREDUCE_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchnormBackwardReduceV2, + SYNCBATCHNORMBACKWADREDUCE_V2_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormBackwardElemt, + SYNCBATCHNORMBACKWARDELEMT_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormBackwardElemtV2, + SYNCBATCHNORMBACKWARDELEMT_V2_PARAM_TYPE); #endif // KERNELS_KERNEL_WRAPPER_WRAPPER_H diff --git a/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp new file mode 100644 index 0000000000..73b31d5c6f --- /dev/null +++ b/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp @@ -0,0 +1,49 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemt( + mluOpHandle_t handle, + const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t mean_dy_desc, + const void *mean_dy, + const mluOpTensorDescriptor_t mean_dy_xmu_desc, + const void *mean_dy_xmu, + const mluOpTensorDescriptor_t diff_x_desc, + void *diff_x) { + SyncBatchNormBackwardElemtWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, diff_y_desc, diff_y, x_desc, + x, mean_desc, mean, invstd_desc, invstd, filter_desc, filter, + mean_dy_desc, mean_dy, mean_dy_xmu_desc, mean_dy_xmu, diff_x_desc, + diff_x); + return ret; +} diff --git a/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp new file mode 100644 index 0000000000..f78d67f92f --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp @@ -0,0 +1,42 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemtV2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, + const mluOpTensorDescriptor_t mean_desc, const void *mean, + const mluOpTensorDescriptor_t invstd_desc, const void *invstd, + const mluOpTensorDescriptor_t filter_desc, const void *filter, + const mluOpTensorDescriptor_t sum_dy_desc, const void *sum_dy, + const mluOpTensorDescriptor_t sum_dy_xmu_desc, const void *sum_dy_xmu, + const mluOpTensorDescriptor_t count_desc, const void *count, + const mluOpTensorDescriptor_t diff_x_desc, void *diff_x) { + SyncBatchNormBackwardElemtV2Wrapper wrapper; + mluOpStatus_t ret = wrapper.invoke( + handle, diff_y_desc, diff_y, x_desc, x, mean_desc, mean, invstd_desc, + invstd, filter_desc, filter, sum_dy_desc, sum_dy, sum_dy_xmu_desc, + sum_dy_xmu, count_desc, count, diff_x_desc, diff_x); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp new file mode 100644 index 0000000000..f94c8bd24c --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -0,0 +1,65 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + SyncBatchnormBackwardReduceWrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, desc_dz, dz, desc_x, x, desc_mean, mean, + desc_invstd, invstd, desc_dfilter, dfilter, desc_dbias, + dbias, desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu, + needs_input_grad0, needs_input_grad1, needs_input_grad2); + return ret; +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + void *workspace, size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + SyncBatchnormBackwardReduceV2Wrapper wrapper; + mluOpStatus_t ret = wrapper.invoke( + handle, desc_dz, dz, desc_x, x, desc_mean, mean, desc_invstd, invstd, + workspace, 
workspace_size, desc_dfilter, dfilter, desc_dbias, dbias, + desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, + needs_input_grad1, needs_input_grad2); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp new file mode 100644 index 0000000000..3de59c38ff --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp @@ -0,0 +1,38 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormElemt( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + const mluOpTensorDescriptor_t mean_desc, const void *mean, + const mluOpTensorDescriptor_t invstd_desc, const void *invstd, + const mluOpTensorDescriptor_t filter_desc, const void *filter, + const mluOpTensorDescriptor_t bias_desc, const void *bias, + const mluOpTensorDescriptor_t y_desc, void *y) { + SyncBatchNormElemtWrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, x_desc, x, mean_desc, mean, invstd_desc, invstd, + filter_desc, filter, bias_desc, bias, y_desc, y); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp new file mode 100644 index 0000000000..d99916aa31 --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormGatherStatsWithCounts( + mluOpHandle_t handle, + const mluOpTensorDescriptor_t mean_all_desc, + const void *mean_all, + const mluOpTensorDescriptor_t invstd_all_desc, + const void *invstd_all, + const mluOpTensorDescriptor_t moving_mean_desc, + void *moving_mean, + const mluOpTensorDescriptor_t moving_var_desc, + void *moving_var, + float momentum, + float eps, + const mluOpTensorDescriptor_t count_all_desc, + const void *count_all, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd) { + SyncBatchNormGatherStatsWithCountsWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, mean_all_desc, mean_all, + invstd_all_desc, invstd_all, moving_mean_desc, moving_mean, + moving_var_desc, moving_var, momentum, eps, count_all_desc, + count_all, mean_desc, mean, invstd_desc, invstd); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp new file mode 100644 index 0000000000..64b7547af1 --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + const float eps, const mluOpTensorDescriptor_t mean_desc, void *mean, + const mluOpTensorDescriptor_t invstd_desc, void *invstd) { + SyncBatchNormStatsWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, x_desc, x, eps, mean_desc, mean, + invstd_desc, invstd); + return ret; +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats_v2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + void *workspace, size_t workspace_size, const float eps, + const mluOpTensorDescriptor_t mean_desc, void *mean, + const mluOpTensorDescriptor_t invstd_desc, void *invstd) { + SyncBatchNormStatsV2Wrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, x_desc, x, workspace, workspace_size, eps, + mean_desc, mean, invstd_desc, invstd); + return ret; +} diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h index 78343071e2..c0615e0975 100644 --- a/bangc-ops/mlu_op.h +++ b/bangc-ops/mlu_op.h @@ -13980,6 +13980,1120 @@ mluOpRoiPoolingBackward(mluOpHandle_t handle, const float spatial_scale, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); + +// Group:SyncBatchNormStats +/*! + * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra + * workspace to optimize the sync_batchnorm_stats operation. + * + * The size of extra workspace is based on the given information of the sync_batchnorm_stats + * operation, including the input tensor descriptor \b x_desc. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batchnorm_stats operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor. 
For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[out] workspace_size + * Pointer to the returned size of the extra workspace in bytes that is used in the + * sync_batchnorm_stats operation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - This API is only used along with ::mluOpSyncBatchNormStats_v2. + * - The ::mluOpSyncBatchNormStats does not require this API. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpGetSyncBatchNormStatsWorkspaceSize(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + size_t *workspace_size); + +// Group:SyncBatchNormStats +/*! + * @brief Computes the local mean and the local inverse standard deviation for each channel + * across a batch of data in the training scenario. + * + * mluOpSyncBatchNormStats_v2 is used in convolutional networks, including but not limited to + * ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchNormStats, this function allows you to allocate some extra + * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size + * to 0, this function will perform the same as ::mluOpSyncBatchNormStats. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batchnorm_stats operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. 
+ * @param[in] workspace + * Pointer to the MLU memory that is used as an extra workspace for the + * ::mluOpSyncBatchNormStats_v2. + * @param[in] workspace_size + * The size of the extra workspace in bytes that needs to be used in + * the ::mluOpSyncBatchNormStats_v2. You can get the size of the workspace with + * the ::mluOpGetSyncBatchNormStatsWorkspaceSize function. + * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * local mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * local inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - float(x) - float(eps) - float(mean) - float(invstd). + * - half(x) - float(eps) - float(mean) - float(invstd). + * + * @par Data Layout + * - The supported data layout of the input tensor is shown as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - The layout of the output tensors are shown as follows: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - Before calling this function to perform ::mluOpSyncBatchNormStats_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchNormStatsWorkspaceSize. + * + * @par note + * - None. 
+ * + * @par Example + * - The example of the sync_batch_norm_stats operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + param: + eps: 0.00001 + output an array by 2 + --> mean: [1.0, 1.0] + --> invstd: [316.221, 316.221] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormStats_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + const void *x, + void *workspace, + size_t workspace_size, + const float eps, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormStats +/*! + * @brief Computes the local mean and the local inverse standard deviation for each channel + * across a batch of data in the training scenario. + * + * SyncBatchnormStats is used in CNN, including but not limited to + * ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * SyncBatchnormStats operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * local mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. 
For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * local inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - \b x - \b eps - \b mean - \b invstd + * - The supported data type combinations are: + * - float - float - float - float. + * - half - float - float - float. + * + * @par Data Layout + * - The supported data layout of the input tensor is shown as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - The layout of the output tensors are shown as follows: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example + * - The example of the sync_batch_norm_stats operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + param: + eps: 0.00001 + output an array by 2 + --> mean: [1.0, 1.0] + --> invstd: [316.221, 316.221] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormStats(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const float eps, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormGatherStatsWithCounts +/*! 
+ * @brief Computes the global mean and the global inverse standard deviation across aggregation + * of the local mean and local inverse standard deviation of multiple MLU devices. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batch_norm_gather_stats_with_counts operation. For detailed information, + * see ::mluOpHandle_t. + * @param[in] mean_all_desc + * The descriptor of the input tensor \b mean_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean_all + * Pointer to the MLU memory that stores the input tensor \b mean_all, which is + * the local mean of multiple MLU devices. + * @param[in] invstd_all_desc + * The descriptor of the input tensor \b invstd_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd_all + * Pointer to the MLU memory that stores the input tensor \b invstd_all, which + * is the local inverse standard deviation of multiple MLU devices. + * @param[in] moving_mean_desc + * The descriptor of the input tensor \b moving_mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in,out] moving_mean + * Pointer to the MLU memory that stores the input tensor \b moving_mean, + * which is the moving average of mean computed over the dimensions of the input tensor + * \b mean_all. The value of this pointer can be NULL. + * @param[in] moving_var_desc + * The descriptor of the input tensor \b moving_var. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in,out] moving_var + * Pointer to the MLU memory that stores the tensor \b moving_var, which is + * the moving average of inverse standard deviation computed over the dimensions of the input + * tensor \b invstd_all. The value of this pointer can be NULL. + * @param[in] momentum + * A floating-point value used to do moving average of \b moving_mean and \b moving_var. 
+ * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] count_all_desc + * The descriptor of the input tensor \b count_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] count_all + * Pointer to the MLU memory that stores an array, which stores the total size of + * dimensions (except C dimension) of input for each MLU device. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * global mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * global inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown as the following order: + * - mean_all - invstd_all - moving_mean - moving_var - momentum - eps - count_all - mean - invstd + * - float - float - float - float - float - float - float - float - float. + * - float - float - half - half - float - float - half - float - float. + * + * @par Data Layout + * - The supported data layout of the input tensors are shown as the following: + * - mean_all tensor: \p MLUOP_LAYOUT_NC. + * - invstd_all tensor: \p MLUOP_LAYOUT_NC. + * - moving_mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - moving_var tensor: \p MLUOP_LAYOUT_ARRAY. + * - momentum: Scalar. + * - eps: Scalar. + * - count_all tensor: \p MLUOP_LAYOUT_ARRAY. + * - The layout of the output tensors are shown as the following: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. 
+ * + * @par API Dependency + * - None. + * + * @par note + * - The input \b mean_all and the input \b invstd_all cannot be positive infinity or negative infinity + * at the same time on MLU300 series or above. + * + * @par Example + * - The example of the sync_batch_norm_gather_stats_with_counts operation is as follows: + @verbatim + --> mean_all: an array [8, 1024]; + --> invstd_all: an array [8, 1024]; + --> moving_mean: an array [1024]; + --> moving_var: an array [1024]; + --> count_all: an array [8]; + param: + --> momentum: 0.1 + --> eps: 0.00001 + output: + --> mean: an array [1024]; + --> invstd: an array [1024]; + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormGatherStatsWithCounts(mluOpHandle_t handle, + const mluOpTensorDescriptor_t mean_all_desc, + const void *mean_all, + const mluOpTensorDescriptor_t invstd_all_desc, + const void *invstd_all, + const mluOpTensorDescriptor_t moving_mean_desc, + void *moving_mean, + const mluOpTensorDescriptor_t moving_var_desc, + void *moving_var, + float momentum, + float eps, + const mluOpTensorDescriptor_t count_all_desc, + const void *count_all, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormElemt +/*! + * @brief Applies Batch Normalization for each channel across a batch of data with the given mean, + * inverse standard deviation and scaling factors. + * + * Batch Normalization is used in artificial intelligence, including but not limited to + * ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the sync batchnorm + * element operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] mean_desc + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which is computed over the + * batch and spatial dimensions by ::mluOpSyncBatchNormGatherStatsWithCounts. + * @param[in] invstd_desc + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which is the inverse standard deviation + * computed over the batch and spatial dimensions by ::mluOpSyncBatchNormGatherStatsWithCounts. + * @param[in] filter_desc + * The descriptor of \b filter tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * The descriptor can be NULL when \b filter pointer is NULL. + * @param[in] filter + * Pointer to the MLU memory that stores the input tensor \b filter for affine transformation + * after batch normalization. The value of this pointer can be NULL. + * @param[in] bias_desc + * The descriptor of \b bias tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * The descriptor can be NULL when \b bias pointer is NULL. + * @param[in] bias + * Pointer to the MLU memory that stores the input tensor \b bias for affine transformation + * after batch normalization. The value of this pointer can be NULL. + * @param[in] y_desc + * The descriptor of the sync batch normalization output tensor \b y. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] y + * Pointer to the MLU memory that stores the output tensor \b y.
+ * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - x_tensor - mean_tensor - invstd_tensor - filter_tensor - bias_tensor - y_tensor + * - float - float - float - float - float - float. + * - half - float - float - float - float - half. + * + * @par Data Layout + * - The supported data layouts of \b x, \b mean, \b invstd, \b filter, \b bias and \b y are as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - bias tensor: \p MLUOP_LAYOUT_ARRAY. + * - y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * The layout of \b y should be the same as that of \b x. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - The \b mean, \b invstd, \b filter and \b bias must be 1D tensors and the length of their dimensions + * should be the same as the length of the lowest dimension of \b x. + * - The length of each dimension of \b x and \b y must be the same. + * + * @par Example + * - The example of the sync batchnorm element operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + + --> mean: [0.5, 0.5] + + --> invstd: [2.0, 2.0] + + --> filter: [0.5, 0.5] + + --> bias: [1.0, 1.0] + + output array by 1 * 2 * 3 * 2 + --> y: [[[[1.5, 1.5],[1.5, 1.5],[1.5, 1.5]], + [[1.5, 1.5],[1.5, 1.5],[1.5, 1.5]]]] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015.
+ * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormElemt(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t bias_desc, + const void *bias, + const mluOpTensorDescriptor_t y_desc, + void *y); + +// Group:SyncBatchnormBackwardReduce +/*! + * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra + * workspace to optimize the sync_batchnorm_backward_reduce operation. + * + * The size of extra workspace is based on the given information of the + * sync_batchnorm_backward_reduce operation, including the input tensor descriptor \b desc_x. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batchnorm_backward_reduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_x + * The descriptor of the input tensor. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] workspace_size + * Pointer to the returned size of the extra workspace in bytes that is used in the + * sync_batchnorm_backward_reduce operation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - This API is only used along with ::mluOpSyncBatchnormBackwardReduce_v2. + * - The ::mluOpSyncBatchnormBackwardReduce does not require this API. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_x, + size_t *workspace_size); + +// Group:SyncBatchnormBackwardReduce +/*!
+ * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad + * filters, grad bias, sum_dy and sum_dy_xmu on each MLU device. + * + * Batch Normalization is used in convolutional networks, including but not limited to + * ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchnormBackwardReduce, this function allows you to allocate some extra + * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size to 0, + * this function will behave the same as ::mluOpSyncBatchnormBackwardReduce. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batchnorm_backward_reduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial + * derivative of batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] desc_mean + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average + * result of input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inverse + * standard deviation of input \b x.
+ * @param[in] workspace + * Pointer to the MLU memory that is used as an extra workspace for the + * ::mluOpSyncBatchnormBackwardReduce_v2. + * @param[in] workspace_size + * The size of the extra workspace in bytes that needs to be used in + * the ::mluOpSyncBatchnormBackwardReduce_v2. You can get the size of the workspace with + * the ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize function. + * @param[in] desc_dfilter + * The descriptor of \b dfilter tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] dfilter + * Pointer to the MLU memory that stores the output tensor \b dfilter, which denotes + * partial derivative of filter in sync batch normalization forward training. It will be computed + * only if boolean variable \b needs_input_grad1 is true. + * @param[in] desc_dbias + * The descriptor of the sync batch normalization output tensor \b dbias. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] dbias + * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial + * derivative of bias in sync batch normalization forward training. It will be computed + * only if \b needs_input_grad2 is true. + * @param[in] desc_sum_dy + * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] sum_dy + * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the + * summation of dz and is also an intermediate variable to compute the partial derivative of + * input x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true. + * @param[in] desc_sum_dy_xmu + * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] sum_dy_xmu + * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes + * sum{dz(x-mean)}.
It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is + * true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilter. + * When \b needs_input_grad1 is true, \b dfilter will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - + * sum_dy_tensor - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layouts of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy + * and \b sum_dy_xmu are as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY.
+ * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - Before calling this function to perform ::mluOpSyncBatchnormBackwardReduce_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy and \b sum_dy_xmu must be 1D tensors + * and the length of the dimensions of these tensors should be the same as the length of + * the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of the sync batchnorm backward reduce operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 1 * 2 * 3 * 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output four arrays by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015.
+ * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchnormBackwardReduce_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + void *workspace, + size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group:SyncBatchnormBackwardReduce +/*! + * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad filters, + * grad bias, sum_dy and sum_dy_xmu on each MLU device. + * + * Batch Normalization is used in CNN, including but not limited to + * ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * sync_batchnorm_backward_reduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial derivative of + * batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] desc_mean + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average result of + * input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inverse standard deviation + * of input \b x. + * @param[in] desc_dfilter + * The descriptor of \b dfilter tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] dfilter + * Pointer to the MLU memory that stores the output tensor \b dfilter, which denotes partial derivative + * of filter in sync batch normalization forward training. It will be computed only if boolean variable + * \b needs_input_grad1 is true. + * @param[in] desc_dbias + * The descriptor of the sync batch normalization output tensor \b dbias. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] dbias + * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial derivative of + * bias in sync batch normalization forward training. It will be computed only if \b needs_input_grad2 is true. + * @param[in] desc_sum_dy + * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy + * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the summation of dz + * and is also an intermediate variable to compute the partial derivative of input x. Moreover, it will be + * computed only if boolean variable \b needs_input_grad0 is true. + * @param[in] desc_sum_dy_xmu + * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy_xmu + * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes sum{dz(x-mean)}.
+ * It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilter. + * When \b needs_input_grad1 is true, \b dfilter will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - + * sum_dy_tensor - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layouts of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy and + * \b sum_dy_xmu are as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY.
+ * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy and \b sum_dy_xmu must be 1D tensors and the + * length of the dimensions of these tensors should be the same as the length of the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of the sync batchnorm backward reduce operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 1 * 2 * 3 * 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output four arrays by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchnormBackwardReduce(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group:SyncBatchNormBackwardElemt +/*! + * @brief Computes the gradients of input in the training scenario.
+ * + * This function is used in artificial intelligence, including but not limited + * to ResNet (Deep Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * syncBatchNormBackwardElemt operation. For detailed information, see ::mluOpHandle_t. + * @param[in] diff_y_desc + * The descriptor of the backpropagated differential tensor \b diff_y. For + * detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] diff_y + * Pointer to the MLU memory that stores the backpropagated differential tensor. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor. + * @param[in] mean_desc + * The descriptor of the input tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the global mean. + * @param[in] invstd_desc + * The descriptor of the input tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the global inverse standard deviation. + * @param[in] filter_desc + * The descriptor of the input tensor \b filter. For detailed information, see + * ::mluOpTensorDescriptor_t. The descriptor can be NULL when \b filter pointer is NULL. + * @param[in] filter + * Pointer to the MLU memory that stores the input tensor \b filter for affine + * transformation after batch normalization. The value of this pointer can be NULL. + * @param[in] mean_dy_desc + * The descriptor of the input tensor \b mean_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean_dy + * Pointer to the MLU memory that stores the mean of diff_y.
+ * @param[in] mean_dy_xmu_desc + * The descriptor of the input tensor \b mean_dy_xmu. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] mean_dy_xmu + * Pointer to the MLU memory that stores the mean of the result of diff_y * (x - mean). + * @param[in] diff_x_desc + * The descriptor of the output tensor \b diff_x. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[out] diff_x + * Pointer to the MLU memory that stores the derivative of input. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below: + * - float(\b diff_y) - float(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b mean_dy) - float(\b mean_dy_xmu) - float(\b diff_x). + * - half(\b diff_y) - half(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b mean_dy) - float(\b mean_dy_xmu) - half(\b diff_x). + * + * @par Data Layout + * - The supported data layouts of \b diff_y, \b x, \b mean, \b invstd, \b filter, \b mean_dy, + * \b mean_dy_xmu and \b diff_x are as follows: + * - diff_y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - mean_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - mean_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * - diff_x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - The layouts of \b diff_x, \b x and \b diff_y should be the same. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None.
+ * + * @par note + * - The \b mean, \b invstd, \b filter, \b mean_dy and \b mean_dy_xmu must be 1D tensors and the + * length of the dimension of these tensors should be the same as the length of the lowest + * dimension of \b x. + * - The length of each dimension of \b diff_y, \b x and \b diff_x must be the same. + * + * @par Example + * - The example of the sync_batch_norm_backward_elemt operation is as follows: + @verbatim + input seven arrays by 1, 1, 1, 1, 1, 1 and 1 + --> diff_y: [[[[1.0]]]] + --> x: [[[[2.0]]]] + --> mean: [3.0] + --> invstd: [4.0] + --> filter: [5.0] + --> mean_dy: [6.0] + --> mean_dy_xmu: [7.0] + + output an array by 1 + --> diff_x: [[[[-8960.0]]]] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_backward_elemt + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormBackwardElemt(mluOpHandle_t handle, + const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t mean_dy_desc, + const void *mean_dy, + const mluOpTensorDescriptor_t mean_dy_xmu_desc, + const void *mean_dy_xmu, + const mluOpTensorDescriptor_t diff_x_desc, + void *diff_x); + +// Group:SyncBatchNormBackwardElemt +/*! + * @brief Computes the gradients of input in the training scenario. + * + * This function is used in ResNet (Deep Residual Network), Yolo (You Only Look Once) and + * R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchNormBackwardElemt, this function first computes the intermediate + * results mean_dy and mean_dy_xmu based on \b sum_dy, \b sum_dy_xmu and \b count, and then + * computes the gradient of \b x with the intermediate results.
+ * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * syncBatchNormBackwardElemt operation. For detailed information, see ::mluOpHandle_t. + * @param[in] diff_y_desc + * The descriptor of the backpropagated differential tensor \b diff_y. For + * detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] diff_y + * Pointer to the MLU memory that stores the backpropagated differential tensor. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor. + * @param[in] mean_desc + * The descriptor of the input tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the global mean. + * @param[in] invstd_desc + * The descriptor of the input tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the global inverse standard deviation. + * @param[in] filter_desc + * The descriptor of the input tensor \b filter. For detailed information, see + * ::mluOpTensorDescriptor_t. The descriptor can be NULL when \b filter pointer is NULL. + * @param[in] filter + * Pointer to the MLU memory that stores the input tensor \b filter for affine + * transformation after batch normalization. The value of this pointer can be NULL. + * @param[in] sum_dy_desc + * The descriptor of the input tensor \b sum_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] sum_dy + * Pointer to the MLU memory that stores the sum of diff_y. + * @param[in] sum_dy_xmu_desc + * The descriptor of the input tensor \b sum_dy_xmu. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] sum_dy_xmu + * Pointer to the MLU memory that stores the sum of the result of diff_y * (x - mean). 
+ * @param[in] count_desc + * The descriptor of the input tensor \b count. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] count + * Pointer to the MLU memory that stores the number of the high dimensions (the dimensions + * except the lowest dimension) of the input tensor \b x on all MLU devices. + * @param[in] diff_x_desc + * The descriptor of the output tensor \b diff_x. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] diff_x + * Pointer to the MLU memory that stores the derivative of input. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM. + * + * @par Data Type + * - The supported combinations of data types are shown below: + * - float(\b diff_y) - float(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b sum_dy) - float(\b sum_dy_xmu) - int32_t(\b count) - float(\b diff_x). + * - half(\b diff_y) - half(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b sum_dy) - float(\b sum_dy_xmu) - int32_t(\b count) - half(\b diff_x). + * + * @par Data Layout + * - The supported data layouts of \b diff_y, \b x, \b mean, \b invstd, \b filter, \b sum_dy, + * \b sum_dy_xmu and \b diff_x are as follows: + * - diff_y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * - diff_x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - The layouts of \b diff_x, \b x and \b diff_y should be the same. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None.
+ * + * @par Note + * - The \b mean, \b invstd, \b filter, \b sum_dy and \b sum_dy_xmu must be 1D tensors and the + * length of the dimension of these tensors should be the same as the length of the lowest + * dimension of \b x. + * - The length of each dimension of \b diff_y, \b x and \b diff_x must be the same. + * + * @par Example + * - The example of the sync_batchnorm_backward_elemt_v2 operation is as follows: + @verbatim + input eight arrays by 1, 1, 1, 1, 1, 1, 1 and 1 + --> diff_y: [[[[1.0]]]] + --> x: [[[[2.0]]]] + --> mean: [3.0] + --> invstd: [4.0] + --> filter: [5.0] + --> sum_dy: [6.0] + --> sum_dy_xmu: [7.0] + --> count: [1] + + output an array by 1 + --> diff_x: [[[[-8960.0]]]] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.11.0/jit_builtin_functions.html?highlight=batch_norm_backward_elemt + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormBackwardElemtV2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t sum_dy_desc, + const void *sum_dy, + const mluOpTensorDescriptor_t sum_dy_xmu_desc, + const void *sum_dy_xmu, + const mluOpTensorDescriptor_t count_desc, + const void *count, + const mluOpTensorDescriptor_t diff_x_desc, + void *diff_x); #if defined(__cplusplus) } #endif diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto index 52e5ca5754..c2bd2338c6 160000 --- a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto @@ -1 +1 @@ -Subproject commit 52e5ca57549553dded7687b7a0762caac7ad39d6 +Subproject commit c2bd2338c67ccb4e98968563315ba27950ce68e7 diff --git
a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp new file mode 100644 index 0000000000..0196c5b794 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batch_norm_backward_elemt.h" + +#include + +namespace mluoptest { + +void cpuSyncBatchNormBackwardElemt(const float *x, const float *diff_y, + const float *weight, const float *mean, + const float *invstd, const float *mean_dy, + const float *mean_dy_xmu, float *diff_x, + const int len_x, const int len_c) { + int len_nhw = len_x / len_c; + for (int ci = 0; ci < len_c; ++ci) { + for (int i = 0; i < len_nhw; ++i) { + if (weight == nullptr) { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - mean_dy[ci] - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * mean_dy_xmu[ci]) * + invstd[ci]; + } else { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - mean_dy[ci] - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * mean_dy_xmu[ci]) * + weight[ci] * invstd[ci]; + } + } + } +} + +void SyncBatchNormBackwardElemtExecutor::paramCheck() { + if (parser_->getInputNum() != 6 && parser_->getInputNum() != 7) { + LOG(ERROR) << "SyncBatchNormBackwardElemtExecutor: input number is wrong. "; + } + if (parser_->getOutputNum() != 1) { + LOG(ERROR) + << "SyncBatchNormBackwardElemtExecutor: output number is wrong. 
"; + } +} + +void SyncBatchNormBackwardElemtExecutor::compute() { + mluOpTensorDescriptor_t x_desc, diff_y_desc, diff_x_desc; + mluOpTensorDescriptor_t mean_desc, invstd_desc, weight_desc, mean_dy_desc, + mean_dy_xmu_desc; + + diff_y_desc = tensor_desc_[0].tensor; + x_desc = tensor_desc_[1].tensor; + mean_desc = tensor_desc_[2].tensor; + invstd_desc = tensor_desc_[3].tensor; + if (parser_->getInputNum() == 7) { + weight_desc = tensor_desc_[4].tensor; + mean_dy_desc = tensor_desc_[5].tensor; + mean_dy_xmu_desc = tensor_desc_[6].tensor; + diff_x_desc = tensor_desc_[7].tensor; + } else { + weight_desc = nullptr; + mean_dy_desc = tensor_desc_[4].tensor; + mean_dy_xmu_desc = tensor_desc_[5].tensor; + diff_x_desc = tensor_desc_[6].tensor; + } + + void *dev_diff_y = data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_weight = nullptr; + void *dev_mean_dy = nullptr; + void *dev_mean_dy_xmu = nullptr; + void *dev_diff_x = nullptr; + if (parser_->getInputNum() == 7) { + dev_weight = data_vector_[4].device_ptr; + dev_mean_dy = data_vector_[5].device_ptr; + dev_mean_dy_xmu = data_vector_[6].device_ptr; + dev_diff_x = data_vector_[7].device_ptr; + } else { + dev_mean_dy = data_vector_[4].device_ptr; + dev_mean_dy_xmu = data_vector_[5].device_ptr; + dev_diff_x = data_vector_[6].device_ptr; + } + + VLOG(4) << "Start to run mluOpSyncBatchNormBackwardElemt()."; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormBackwardElemt( + handle_, diff_y_desc, dev_diff_y, x_desc, dev_x, mean_desc, dev_mean, + invstd_desc, dev_invstd, weight_desc, dev_weight, mean_dy_desc, + dev_mean_dy, mean_dy_xmu_desc, dev_mean_dy_xmu, diff_x_desc, dev_diff_x)); + interface_timer_.stop(); + VLOG(4) << "mluOpSyncBatchNormBackwardElemt() end"; +} + +void SyncBatchNormBackwardElemtExecutor::cpuCompute() { + int len_x = parser_->getInputDataCount(0); + int len_c = 
tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + + if (len_x == 0 || len_c == 0) { + VLOG(4) << "SyncBatchNormBackwardElemtExecutor: cpu compute zero elemt"; + return; + } + + VLOG(4) << "Start to run cpuSyncBatchNormBackwardElemt()."; + + float *cpu_diff_y = cpu_fp32_input_[0]; + float *cpu_x = cpu_fp32_input_[1]; + float *cpu_mean = cpu_fp32_input_[2]; + float *cpu_invstd = cpu_fp32_input_[3]; + float *cpu_weight = nullptr; + float *cpu_mean_dy = nullptr; + float *cpu_mean_dy_xmu = nullptr; + float *cpu_diff_x = cpu_fp32_output_[0]; + if (parser_->getInputNum() == 7) { + cpu_weight = cpu_fp32_input_[4]; + cpu_mean_dy = cpu_fp32_input_[5]; + cpu_mean_dy_xmu = cpu_fp32_input_[6]; + } else { + cpu_mean_dy = cpu_fp32_input_[4]; + cpu_mean_dy_xmu = cpu_fp32_input_[5]; + } + + cpuSyncBatchNormBackwardElemt(cpu_x, cpu_diff_y, cpu_weight, cpu_mean, + cpu_invstd, cpu_mean_dy, cpu_mean_dy_xmu, + cpu_diff_x, len_x, len_c); + VLOG(4) << "cpuSyncBatchNormBackwardElemt() end"; +} + +int64_t SyncBatchNormBackwardElemtExecutor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + if (parser_->getInputNum() == 7) { + theory_ops = 5 * len_x + 3 * len_c; + } else { + theory_ops = 5 * len_x + 2 * len_c; + } + + VLOG(4) << "SyncBatchNormBackwardElemtExecutor: getTheoryOps: " << theory_ops + << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h new file mode 100644 index 0000000000..1a8327b775 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h @@ -0,0 +1,36 @@ +/************************************************************************* + * 
Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchNormBackwardElemtExecutor : public Executor { + public: + SyncBatchNormBackwardElemtExecutor() {} + ~SyncBatchNormBackwardElemtExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt new file mode 100644 index 0000000000..1f6ac081d3 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt @@ -0,0 +1,124 @@ +op_name: "sync_batch_norm_backward_elemt" +op_type: "SYNC_BATCHNORM_BACKWARD_ELEMT" +input { + id: "diff_y" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: -2.0 + lower_bound: 2.0 + distribution: UNIFORM + } +} +input { + 
id: "x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 243 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.25 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "mean_dy" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "mean_dy_xmu" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +output { + id: "diff_x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp new file mode 100644 index 0000000000..635a75f80b --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp @@ -0,0 +1,166 @@ +/************************************************************************* + * Copyright (C) [2019-2022] 
by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "sync_batchnorm_backward_elemt_v2.h" + +namespace mluoptest { + +void cpuSyncBatchnormBackwardElemt(const float *diff_y, const float *x, + const float *mean, const float *invstd, + const float *weight, const float *sum_dy, + const float *sum_dy_xmu, const int32_t sum, + float *diff_x, const int len_x, + const int len_c) { + int len_nhw = len_x / len_c; + for (int ci = 0; ci < len_c; ++ci) { + float sum_dy_temp = sum_dy[ci] / sum; + float sum_dy_xmu_temp = sum_dy_xmu[ci] / sum; + for (int i = 0; i < len_nhw; ++i) { + if (weight == nullptr) { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - sum_dy_temp - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * sum_dy_xmu_temp) * + invstd[ci]; + } else { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - sum_dy_temp - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * sum_dy_xmu_temp) * + weight[ci] * invstd[ci]; + } + } + } +} + +void SyncBatchnormBackwardElemtV2Executor::paramCheck() { + if (parser_->getInputNum() != 7 && parser_->getInputNum() != 8) { + LOG(ERROR) + << "SyncBatchnormBackwardElemtV2Executor: input number is wrong. "; + } + if (parser_->getOutputNum() != 1) { + LOG(ERROR) + << "SyncBatchnormBackwardElemtV2Executor: output number is wrong. 
"; + } +} + +void SyncBatchnormBackwardElemtV2Executor::compute() { + mluOpTensorDescriptor_t x_desc, diff_y_desc, diff_x_desc, count_desc; + mluOpTensorDescriptor_t mean_desc, invstd_desc, weight_desc, sum_dy_desc, + sum_dy_xmu_desc; + + diff_y_desc = tensor_desc_[0].tensor; + x_desc = tensor_desc_[1].tensor; + mean_desc = tensor_desc_[2].tensor; + invstd_desc = tensor_desc_[3].tensor; + if (parser_->getInputNum() == 8) { + weight_desc = tensor_desc_[4].tensor; + sum_dy_desc = tensor_desc_[5].tensor; + sum_dy_xmu_desc = tensor_desc_[6].tensor; + count_desc = tensor_desc_[7].tensor; + diff_x_desc = tensor_desc_[8].tensor; + } else { + weight_desc = nullptr; + sum_dy_desc = tensor_desc_[4].tensor; + sum_dy_xmu_desc = tensor_desc_[5].tensor; + count_desc = tensor_desc_[6].tensor; + diff_x_desc = tensor_desc_[7].tensor; + } + + void *dev_diff_y = data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_weight = nullptr; + void *dev_sum_dy = nullptr; + void *dev_sum_dy_xmu = nullptr; + void *dev_count = nullptr; + void *dev_diff_x = nullptr; + if (parser_->getInputNum() == 8) { + dev_weight = data_vector_[4].device_ptr; + dev_sum_dy = data_vector_[5].device_ptr; + dev_sum_dy_xmu = data_vector_[6].device_ptr; + dev_count = data_vector_[7].device_ptr; + dev_diff_x = data_vector_[8].device_ptr; + } else { + dev_sum_dy = data_vector_[4].device_ptr; + dev_sum_dy_xmu = data_vector_[5].device_ptr; + dev_count = data_vector_[6].device_ptr; + dev_diff_x = data_vector_[7].device_ptr; + } + + VLOG(4) << "Start to run mluOpSyncBatchnormBackwardElemt_v2()."; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormBackwardElemtV2( + handle_, diff_y_desc, dev_diff_y, x_desc, dev_x, mean_desc, dev_mean, + invstd_desc, dev_invstd, weight_desc, dev_weight, sum_dy_desc, dev_sum_dy, + sum_dy_xmu_desc, dev_sum_dy_xmu, count_desc, dev_count, diff_x_desc, + 
dev_diff_x)); + interface_timer_.stop(); + VLOG(4) << "mluOpSyncBatchnormBackwardElemt_v2() end"; +} + +void SyncBatchnormBackwardElemtV2Executor::cpuCompute() { + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_count = parser_->getInputDataCount(parser_->getInputNum() - 1); + + if (len_x == 0 || len_c == 0) { + VLOG(4) << "SyncBatchnormBackwardElemtV2Executor: cpu compute zero elemt"; + return; + } + + VLOG(4) << "Start to run cpuSyncBatchnormBackwardElemt()."; + + float *cpu_diff_y = cpu_fp32_input_[0]; + float *cpu_x = cpu_fp32_input_[1]; + float *cpu_mean = cpu_fp32_input_[2]; + float *cpu_invstd = cpu_fp32_input_[3]; + float *cpu_weight = nullptr; + float *cpu_sum_dy = nullptr; + float *cpu_sum_dy_xmu = nullptr; + float *cpu_count = nullptr; + float *cpu_diff_x = cpu_fp32_output_[0]; + if (parser_->getInputNum() == 8) { + cpu_weight = cpu_fp32_input_[4]; + cpu_sum_dy = cpu_fp32_input_[5]; + cpu_sum_dy_xmu = cpu_fp32_input_[6]; + cpu_count = cpu_fp32_input_[7]; + } else { + cpu_sum_dy = cpu_fp32_input_[4]; + cpu_sum_dy_xmu = cpu_fp32_input_[5]; + cpu_count = cpu_fp32_input_[6]; + } + int sum = 0; // summed over every entry of count (the last input) + for (int k = 0; k < len_count; k++) { + sum += (int32_t)(cpu_count[k]); + } + + cpuSyncBatchnormBackwardElemt(cpu_diff_y, cpu_x, cpu_mean, cpu_invstd, + cpu_weight, cpu_sum_dy, cpu_sum_dy_xmu, sum, + cpu_diff_x, len_x, len_c); + VLOG(4) << "cpuSyncBatchnormBackwardElemt() end"; +} + +int64_t SyncBatchnormBackwardElemtV2Executor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + if (parser_->getInputNum() == 8) { // optional weight input present + theory_ops = 5 * len_x + 3 * len_c; + } else { + theory_ops = 5 * len_x + 2 * len_c; + } + + VLOG(4) << "SyncBatchnormBackwardElemtV2Executor: getTheoryOps: " + << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git
a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h new file mode 100644 index 0000000000..7ff27b8a12 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h @@ -0,0 +1,36 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormBackwardElemtV2Executor : public Executor { + public: + SyncBatchnormBackwardElemtV2Executor() {} + ~SyncBatchnormBackwardElemtV2Executor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt new file mode 100644 index 0000000000..b6a7115324 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt @@ -0,0 +1,138 @@ +op_name: "sync_batchnorm_backward_elemt_v2" +op_type: "SYNC_BATCHNORM_BACKWARD_ELEMT_V2" +input { + id: "diff_y" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: -2.0 + lower_bound: 2.0 + distribution: UNIFORM + } +} +input { + id: "x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 243 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.25 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + 
random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "sum_dy" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "sum_dy_xmu" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "count" + shape: { + dims: 1 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_INT32 + random_data: { + seed: 233 + upper_bound: 22 + lower_bound: 2 + distribution: UNIFORM + } +} +output { + id: "diff_x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp new file mode 100644 index 0000000000..1571f556cb --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -0,0 +1,285 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "sync_batchnorm_backward_reduce.h" + +namespace mluoptest { + +void SyncBatchnormBackwardReduceExecutor::paramCheck() { + GTEST_CHECK(parser_->node()->has_sync_batchnorm_backward_reduce_param(), + "Lose sync_batchnorm_backward_reduce param."); +} + +void SyncBatchnormBackwardReduceExecutor::workspaceMalloc() { + auto tensor_x = tensor_desc_[1].tensor; + void *tmp = nullptr; + // allocate extra nram space for deletion of CDMA + MLUOP_CHECK(mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( + handle_, tensor_x, &workspace_size_)); + if (workspace_size_ > 0) { + VLOG(4) << "Malloc workspace space for deletion of CDMA."; + tmp = mlu_runtime_.allocate(workspace_size_); + VLOG(4) << "Mallocated addr: " << tmp << ", size: " << workspace_size_; + } else { + VLOG(4) << "Don't need to Malloc workspace space."; + } + workspace_.push_back(tmp); + eva_->setMluWorkspaceSize(workspace_size_); +} + +void SyncBatchnormBackwardReduceExecutor::workspaceFree() { + if (workspace_[0]) { + VLOG(4) << "Free device workspace space."; + mlu_runtime_.deallocate(workspace_[0]); + } +} + +void SyncBatchnormBackwardReduceExecutor::compute() { + const bool needs_input_grad0 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad0(); + const bool needs_input_grad1 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad1(); + const bool needs_input_grad2 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad2(); + // input tensor description + mluOpTensorDescriptor_t desc_dz = tensor_desc_[0].tensor; + mluOpTensorDescriptor_t desc_x = 
tensor_desc_[1].tensor; + mluOpTensorDescriptor_t desc_mean = tensor_desc_[2].tensor; + mluOpTensorDescriptor_t desc_invstd = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t desc_sum_dy = NULL; + mluOpTensorDescriptor_t desc_sum_dy_xmu = NULL; + mluOpTensorDescriptor_t desc_dweight = NULL; + mluOpTensorDescriptor_t desc_dbias = NULL; + + if (needs_input_grad0 == 1 && needs_input_grad1 == 0 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 2, + "[Output MISMATCHED]: Only sum_dy and sum_dy_xmu will be " + "compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 1 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 1, + "[Output MISMATCHED]: Only dweight will be compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 0 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 1, + "[Output MISMATCHED]: Only dbias will be compute currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 1 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 3, + "[Output MISMATCHED]: Only sum_dy, sum_dy_xmu, dweight will be " + "compute currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 0 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 3, + "[Output MISMATCHED]: Only sum_dy, sum_dy_xmu, dbias will be " + "compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 1 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 2, + "[Output MISMATCHED]: Only dweight and dbias will be compute " + "currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 1 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 4, + "[Output MISMATCHED]: All of the four outputs will be compute " + "currently."); + } + // input pointer for device + void *dev_dz = data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = 
data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_sum_dy = NULL; + void *dev_sum_dy_xmu = NULL; + void *dev_dweight = NULL; + void *dev_dbias = NULL; + + if (needs_input_grad0) { + desc_sum_dy = tensor_desc_[5].tensor; + desc_sum_dy_xmu = tensor_desc_[6].tensor; + dev_sum_dy = data_vector_[5].device_ptr; + dev_sum_dy_xmu = data_vector_[6].device_ptr; + if (needs_input_grad1) { + desc_dweight = tensor_desc_[7].tensor; + dev_dweight = data_vector_[7].device_ptr; + if (needs_input_grad2) { + desc_dbias = tensor_desc_[8].tensor; + dev_dbias = data_vector_[8].device_ptr; + } + } else { + if (needs_input_grad2) { + desc_dbias = tensor_desc_[7].tensor; + dev_dbias = data_vector_[7].device_ptr; + } + } + } else { + if (needs_input_grad1) { + desc_dweight = tensor_desc_[5].tensor; + dev_dweight = data_vector_[5].device_ptr; + if (needs_input_grad2) { + desc_dbias = tensor_desc_[6].tensor; + dev_dbias = data_vector_[6].device_ptr; + } + } else { + if (needs_input_grad2) { + desc_dbias = tensor_desc_[5].tensor; + dev_dbias = data_vector_[5].device_ptr; + } + } + } + + VLOG(4) << "Start to run mluOpSyncBatchNormBackwardReduce()."; + interface_timer_.start(); +#if 1 + VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce_v2."; + MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce_v2( + handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, + dev_invstd, workspace_[0], workspace_size_, desc_dweight, dev_dweight, + desc_dbias, dev_dbias, desc_sum_dy, dev_sum_dy, desc_sum_dy_xmu, + dev_sum_dy_xmu, needs_input_grad0, needs_input_grad1, needs_input_grad2)); +#else + VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce."; + MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce( + handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, + dev_invstd, desc_dweight, dev_dweight, desc_dbias, dev_dbias, desc_sum_dy, + dev_sum_dy, desc_sum_dy_xmu, dev_sum_dy_xmu, needs_input_grad0, + needs_input_grad1, needs_input_grad2)); +#endif + 
+ interface_timer_.stop(); +} + +void cpuGetSyncBnBkwReduceOuput( + const float *x, const float *diff_z, const float *mean, const float *invstd, + float *diff_weight, float *diff_bias, float *sum_dy, float *sum_dy_xmu, + const int len_x, const int len_c, const bool needs_input_grad0, + const bool needs_input_grad1, const bool needs_input_grad2) { + if (len_x == 0 || len_c == 0) { + LOG(ERROR) << "SyncBnBackwardReduce: the element number of input tensor " + "should not be zero"; + return; + } + int len_nhw = len_x / len_c; + float *x_hat = new float[len_x]; + float *xmu = new float[len_x]; + + for (int ci = 0; ci < len_c; ++ci) { + const float *xc = x + ci; + float *x_hat_c = x_hat + ci; + float *xmu_c = xmu + ci; + for (int xi = 0; xi < len_nhw; ++xi) { + xmu_c[xi * len_c] = xc[xi * len_c] - mean[ci]; + x_hat_c[xi * len_c] = xmu_c[xi * len_c] * invstd[ci]; + } + } + + for (int ci = 0; ci < len_c; ++ci) { + const float *x_hat_c = x_hat + ci; + const float *xmu_c = xmu + ci; + const float *dzc = diff_z + ci; + double dweight = 0, dbias = 0, meandyxmu = 0; + for (int i = 0; i < len_nhw; i++) { + dweight = dweight + x_hat_c[i * len_c] * dzc[i * len_c]; + dbias = dbias + dzc[i * len_c]; + meandyxmu = meandyxmu + xmu_c[i * len_c] * dzc[i * len_c]; + } + if (needs_input_grad0 == true) { + // diff_weight[ci] = dweight; + // diff_bias[ci] = dbias; + sum_dy[ci] = dbias; + sum_dy_xmu[ci] = meandyxmu; + } + if (needs_input_grad1 == true) { + diff_weight[ci] = dweight; + } + if (needs_input_grad2 == true) { + diff_bias[ci] = dbias; + } + } + delete[] x_hat; + delete[] xmu; +} + +void SyncBatchnormBackwardReduceExecutor::cpuCompute() { + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_x = parser_->getInputDataCount(0); + const bool needs_input_grad0 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad0(); + const bool needs_input_grad1 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + 
.needs_input_grad1(); + const bool needs_input_grad2 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad2(); + + auto tensor_dz = cpu_fp32_input_[0]; + auto tensor_x = cpu_fp32_input_[1]; + auto tensor_mean = cpu_fp32_input_[2]; + auto tensor_invstd = cpu_fp32_input_[3]; + + float *tensor_sum_dy = nullptr; // outputs are optional; bound below + float *tensor_sum_dy_xmu = nullptr; + float *tensor_dweight = nullptr; + float *tensor_dbias = nullptr; + if (needs_input_grad0) { + tensor_sum_dy = cpu_fp32_output_[0]; + tensor_sum_dy_xmu = cpu_fp32_output_[1]; + if (needs_input_grad1) { + tensor_dweight = cpu_fp32_output_[2]; + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[3]; + } + } else { + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[2]; + } + } + } else { + if (needs_input_grad1) { + tensor_dweight = cpu_fp32_output_[0]; + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[1]; + } + } else { + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[0]; + } + } + } + + // const bool needs_input_grad[3] = {1,1,1}; + // call the cpu compute function to get:-> grad weight, grad bias, sum_dy, + // sum_dy_xmu + cpuGetSyncBnBkwReduceOuput(tensor_x, tensor_dz, tensor_mean, tensor_invstd, + tensor_dweight, tensor_dbias, tensor_sum_dy, + tensor_sum_dy_xmu, len_x, len_c, needs_input_grad0, + needs_input_grad1, needs_input_grad2); +} + +int64_t SyncBatchnormBackwardReduceExecutor::getTheoryOps() { + int cp_count = 8; + int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h new file mode 100644 index 0000000000..6bd07950bc --- /dev/null +++
b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ +#include "executor.h" + +namespace mluoptest { +class SyncBatchnormBackwardReduceExecutor : public Executor { + public: + SyncBatchnormBackwardReduceExecutor() {} + ~SyncBatchnormBackwardReduceExecutor() {} + + void paramCheck(); + void workspaceMalloc(); + void workspaceFree(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + + private: + size_t workspace_size_ = 0; +}; + +} // namespace mluoptest +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt new file mode 100644 index 0000000000..d62f473f3b --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt @@ -0,0 +1,122 @@ +op_name: 
"sync_batchnorm_backward_reduce" +op_type: "SYNC_BATCHNORM_BACKWARD_REDUCE" +input{ + id:"dz" + shape:{ + dims: 55 + dims: 14 + dims: 14 + dims: 2000 + } + layout:LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data:{ + seed:4 + upper_bound:5.5 + lower_bound:0.1 + distribution:UNIFORM + } +} +input{ + id:"x" + shape:{ + dims: 55 + dims: 14 + dims: 14 + dims: 2000 + } + layout:LAYOUT_NHWC + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:5 + lower_bound:0.5 + distribution: UNIFORM + } +} +input{ + id:"mean" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:5 + lower_bound:1 + distribution: UNIFORM + } +} +input{ + id:"invstd" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:8 + lower_bound:0.8 + distribution: UNIFORM + } +} +input{ + id:"weight" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:8 + lower_bound:0.8 + distribution: UNIFORM + } +} +output{ + id:"sum_dy" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "sum_dy_xmu" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "dweight" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "dbias" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +sync_batchnorm_backward_reduce_param: { + needs_input_grad0: true + needs_input_grad1: true + needs_input_grad2: true +} +test_param:{ + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp new file mode 100644 index 0000000000..4b6619e673 --- /dev/null +++ 
b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp @@ -0,0 +1,109 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "sync_batchnorm_elemt.h" + +namespace mluoptest { + +void SyncBatchnormElemtExecutor::paramCheck() { + if (parser_->getInputNum() != 3 && parser_->getInputNum() != 5) { + LOG(ERROR) << "SyncBatchnormElemtExecutor: input number is wrong. "; + } + if (parser_->getOutputNum() != 1) { + LOG(ERROR) << "SyncBatchnormElemtExecutor: output number is wrong. 
"; + } +} + +void SyncBatchnormElemtExecutor::compute() { + VLOG(4) << "SyncBatchnormElemtExecutor compute begin"; + auto x_desc = tensor_desc_[0].tensor; + auto dev_x = data_vector_[0].device_ptr; + auto mean_desc = tensor_desc_[1].tensor; + auto dev_mean = data_vector_[1].device_ptr; + auto invstd_desc = tensor_desc_[2].tensor; + auto dev_invstd = data_vector_[2].device_ptr; + + if (parser_->getInputNum() == 3) { + auto y_desc = tensor_desc_[3].tensor; + auto dev_y = data_vector_[3].device_ptr; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormElemt( + handle_, x_desc, dev_x, mean_desc, dev_mean, invstd_desc, dev_invstd, + nullptr, nullptr, nullptr, nullptr, y_desc, dev_y)); + interface_timer_.stop(); + } else if (parser_->getInputNum() == 5) { + auto weight_desc = tensor_desc_[3].tensor; + auto dev_weight = data_vector_[3].device_ptr; + auto bias_desc = tensor_desc_[4].tensor; + auto dev_bias = data_vector_[4].device_ptr; + auto y_desc = tensor_desc_[5].tensor; + auto dev_y = data_vector_[5].device_ptr; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormElemt( + handle_, x_desc, dev_x, mean_desc, dev_mean, invstd_desc, dev_invstd, + weight_desc, dev_weight, bias_desc, dev_bias, y_desc, dev_y)); + interface_timer_.stop(); + } + VLOG(4) << "SyncBatchnormElemtExecutor compute end"; +} + +void cpuSyncBNElemt(const float *x, const float *mean, const float *invstd, + float *weight, float *bias, float *y, const int len_x, + const int len_c) { + int len_nhw = len_x / len_c; + + for (int h = 0; h < len_nhw; ++h) { + for (int c = 0; c < len_c; ++c) { + y[h * len_c + c] = (x[h * len_c + c] - mean[c]) * invstd[c]; + if (weight != nullptr && bias != nullptr) { + y[h * len_c + c] = y[h * len_c + c] * weight[c] + bias[c]; + } + } + } +} + +void SyncBatchnormElemtExecutor::cpuCompute() { + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_x = parser_->getInputDataCount(0); + + VLOG(4) << "SyncBatchnormElemtExecutor: cpu 
compute begin"; + // actually len_c = 0, then len_x must be 0 + if (len_c == 0 || len_x == 0) { + VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute zero elemt"; + return; + } + + if (parser_->getInputNum() == 3) { + VLOG(4) << "weight and bias is nullptr"; + cpuSyncBNElemt(cpu_fp32_input_[0], cpu_fp32_input_[1], cpu_fp32_input_[2], + nullptr, nullptr, cpu_fp32_output_[0], len_x, len_c); + } else if (parser_->getInputNum() == 5) { + cpuSyncBNElemt(cpu_fp32_input_[0], cpu_fp32_input_[1], cpu_fp32_input_[2], + cpu_fp32_input_[3], cpu_fp32_input_[4], cpu_fp32_output_[0], + len_x, len_c); + } + VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute end"; +} + +int64_t SyncBatchnormElemtExecutor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + if (parser_->getInputNum() == 3) { + theory_ops = len_x * 2; + } else if (parser_->getInputNum() == 5) { + theory_ops = len_x * 4; + } + VLOG(4) << "SyncBatchnormElemtExecutor: getTheoryOps: " << theory_ops + << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h new file mode 100644 index 0000000000..44d37155e6 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_SYNC_BATCHNORM_ELEMT_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_SYNC_BATCHNORM_ELEMT_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormElemtExecutor : public Executor { + public: + SyncBatchnormElemtExecutor() {} + ~SyncBatchnormElemtExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_\ +SYNC_BATCHNORM_ELEMT_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt new file mode 100644 index 0000000000..45772bbba1 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt @@ -0,0 +1,93 @@ +op_name: "sync_batchnorm_elemt" +op_type: "SYNC_BATCHNORM_ELEMT" +input { + id: "x" + shape: { + dims: 4 + dims: 14 + dims: 14 + dims: 1025 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 2.0 + lower_bound: 1.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 1.5 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 1.25 + lower_bound: 0.25 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { 
+ dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "bias" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 1.0 + lower_bound: -1.0 + distribution: UNIFORM + } +} +output { + id: "y" + shape: { + dims: 4 + dims: 14 + dims: 14 + dims: 1025 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp new file mode 100644 index 0000000000..e9f2041337 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp @@ -0,0 +1,200 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batchnorm_gather_stats_with_counts.h" + +namespace mluoptest { + +void SyncBatchnormGatherStatsWithCountsExecutor::paramCheck() { + if (!parser_->getProtoNode() + ->has_sync_batchnorm_gather_stats_with_counts_param()) { + LOG(ERROR) << "Lose sync_batchnorm_gather_stats_with_counts param."; + } + + // set flag + flag_input_reuse_ = false; +} + +void SyncBatchnormGatherStatsWithCountsExecutor::compute() { + float eps = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .eps(); + float momentum = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .momentum(); + + mluOpTensorDescriptor_t mean_all_desc; + mluOpTensorDescriptor_t invstd_all_desc; + mluOpTensorDescriptor_t count_all_desc; + mean_all_desc = tensor_desc_[1].tensor; + invstd_all_desc = tensor_desc_[2].tensor; + + // if num_inputs = 4, then [input, mean_all, invstd_all, count_all] -> [mean, + // invstd] if num_inputs = 6, then [input, mean_all, invstd_all, moving_mean, + // moving_var, count_all] + // -> [moving_mean, moving_var, mean, invstd] + VLOG(4) << "Start to run mluOpSyncBatchNormGatherStatsWithCounts()."; + if (parser_->getInputNum() == 4) { + count_all_desc = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t mean_desc = tensor_desc_[4].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[5].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, nullptr, nullptr, nullptr, nullptr, + momentum, eps, count_all_desc, data_vector_[3].device_ptr, mean_desc, + data_vector_[4].device_ptr, invstd_desc, data_vector_[5].device_ptr)); + interface_timer_.stop(); + } else if (parser_->getInputNum() == 6) { + mluOpTensorDescriptor_t moving_mean_desc = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t moving_var_desc = 
tensor_desc_[4].tensor; + count_all_desc = tensor_desc_[5].tensor; + if (parser_->getOutputNum() == 2) { + mluOpTensorDescriptor_t mean_desc = tensor_desc_[6].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[7].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, moving_mean_desc, + data_vector_[3].device_ptr, moving_var_desc, + data_vector_[4].device_ptr, momentum, eps, count_all_desc, + data_vector_[5].device_ptr, mean_desc, data_vector_[6].device_ptr, + invstd_desc, data_vector_[7].device_ptr)); + interface_timer_.stop(); + } else { + mluOpTensorDescriptor_t mean_desc = tensor_desc_[8].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[9].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, moving_mean_desc, + data_vector_[3].device_ptr, moving_var_desc, + data_vector_[4].device_ptr, momentum, eps, count_all_desc, + data_vector_[5].device_ptr, mean_desc, data_vector_[8].device_ptr, + invstd_desc, data_vector_[9].device_ptr)); + interface_timer_.stop(); + + data_vector_[3].is_output = true; + data_vector_[4].is_output = true; + data_vector_[6].is_output = false; + data_vector_[7].is_output = false; + } + } +} + +void kahan(float input, float &sum, float &delta) { + float y = input - delta; + float t = sum + y; + delta = t - sum - y; + sum = t; +} + +void cpuBatchNormForwardTraining(float *mean_all, float *invstd_all, + float *moving_mean, float *moving_var, + const float momentum, const float eps, + float *count_all, float *m_mean, float *m_var, + float *mean, float *invstd, + const int len_mean_all, const int len_c, + const int output_num) { + int len_n = len_mean_all / len_c; + int len_all = 0; + for (int i = 0; i < len_n; ++i) { + len_all += count_all[i]; + } + + // 
B.P.Welford algo + for (int ci = 0; ci < len_c; ++ci) { + float c_sum = 0.0, c_ssum = 0.0; + const float *meanc = mean_all + ci; + const float *invstdc = invstd_all + ci; + float sum = 0.0, ssum = 0.0, temp = 0.0; + for (int xi = 0; xi < len_n; ++xi) { + kahan(meanc[xi * len_c] * count_all[xi], sum, c_sum); + temp = 1.0f / (invstdc[xi * len_c] * invstdc[xi * len_c]) + + meanc[xi * len_c] * meanc[xi * len_c] - eps; + kahan(temp * count_all[xi], ssum, c_ssum); + } + mean[ci] = sum / len_all; + invstd[ci] = 1.0f / sqrt(ssum / len_all - mean[ci] * mean[ci] + eps); + float unbiased_var = + (1.0f / (invstd[ci] * invstd[ci]) - eps) * len_all / (len_all - 1); + if (moving_mean != nullptr && moving_var != nullptr && output_num == 4) { + m_mean[ci] = momentum * mean[ci] + (1 - momentum) * moving_mean[ci]; + m_var[ci] = momentum * unbiased_var + (1 - momentum) * moving_var[ci]; + } + } +} + +void SyncBatchnormGatherStatsWithCountsExecutor::cpuCompute() { + float eps = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .eps(); + float momentum = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .momentum(); + + int idx_c = tensor_desc_[0].tensor->dim - 1; + int len_c = tensor_desc_[0].tensor->dims[idx_c]; + int len_count_all = 1; + int len_mean_all = 1; + int len_invstd_all = 1; + if (parser_->getInputNum() == 4) { + len_count_all = tensor_desc_[3].tensor->dims[0]; + } else if (parser_->getInputNum() == 6) { + len_count_all = tensor_desc_[5].tensor->dims[0]; + } + for (int i = 0; i < tensor_desc_[1].tensor->dim; ++i) { + len_mean_all *= tensor_desc_[1].tensor->dims[i]; + } + for (int i = 0; i < tensor_desc_[2].tensor->dim; ++i) { + len_invstd_all *= tensor_desc_[2].tensor->dims[i]; + } + if (len_mean_all == 0 || len_c == 0 || len_count_all == 0 || + len_mean_all != len_invstd_all) { + return; + } + int output_num = parser_->getOutputNum(); + VLOG(4) << "Start to run cpuBatchNormForwardTraining()."; + if 
(parser_->getInputNum() == 4) { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], nullptr, nullptr, momentum, eps, + cpu_fp32_input_[3], nullptr, nullptr, cpu_fp32_output_[0], + cpu_fp32_output_[1], len_mean_all, len_c, output_num); + } else if (parser_->getInputNum() == 6) { + if (parser_->getOutputNum() == 2) { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], cpu_fp32_input_[3], + cpu_fp32_input_[4], momentum, eps, cpu_fp32_input_[5], nullptr, + nullptr, cpu_fp32_output_[0], cpu_fp32_output_[1], len_mean_all, + len_c, output_num); + } else { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], cpu_fp32_input_[3], + cpu_fp32_input_[4], momentum, eps, cpu_fp32_input_[5], + cpu_fp32_output_[0], cpu_fp32_output_[1], cpu_fp32_output_[2], + cpu_fp32_output_[3], len_mean_all, len_c, output_num); + } + } +} + +int64_t SyncBatchnormGatherStatsWithCountsExecutor::getTheoryOps() { + int cp_count = 8; + int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +std::set +SyncBatchnormGatherStatsWithCountsExecutor::getCriterionsUse() const { + return {Evaluator::DIFF1, Evaluator::DIFF2, Evaluator::DIFF3}; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h new file mode 100644 index 0000000000..f9f9bfecb3 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h @@ -0,0 +1,39 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_\ +SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_\ +SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ +#include +#include + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormGatherStatsWithCountsExecutor : public Executor { + public: + SyncBatchnormGatherStatsWithCountsExecutor() {} + ~SyncBatchnormGatherStatsWithCountsExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + std::set getCriterionsUse() const override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS\ +_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt new file mode 100644 index 0000000000..de057a5b2e --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt @@ -0,0 +1,118 @@ +op_name: "sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: 
LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "moving_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "moving_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 0.5 + lower_bound: 0.001 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt new file mode 100644 index 0000000000..0b389c8b49 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt @@ -0,0 +1,134 @@ +op_name: 
"sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "moving_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "moving_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 0.5 + lower_bound: 0.001 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "m_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "m_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git 
a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt new file mode 100644 index 0000000000..3758739cac --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt @@ -0,0 +1,90 @@ +op_name: "sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp new file mode 100644 index 
0000000000..df9d64683c
--- /dev/null
+++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp
@@ -0,0 +1,148 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "sync_batchnorm_stats.h"
+
+#include <cmath>
+
+namespace mluoptest {
+
+// Logs an error when the case proto has no sync_batchnorm_stats_param node.
+void SyncBatchnormStatsExecutor::paramCheck() {
+  if (!parser_->getProtoNode()->has_sync_batchnorm_stats_param()) {
+    LOG(ERROR) << "Lose sync_batchnorm_stats param.";
+  }
+}
+
+// Queries the workspace size for the input descriptor, allocates it when
+// non-zero, and pushes one entry (nullptr for size zero) onto workspace_.
+void SyncBatchnormStatsExecutor::workspaceMalloc() {
+  auto tensor_x = tensor_desc_[0].tensor;
+  void *tmp = nullptr;
+  // allocate extra nram space for deletion of CDMA
+  MLUOP_CHECK(mluOpGetSyncBatchNormStatsWorkspaceSize(handle_, tensor_x,
+                                                      &workspace_size_));
+  if (workspace_size_ > 0) {
+    VLOG(4) << "Malloc workspace space for deletion of CDMA.";
+    tmp = mlu_runtime_.allocate(workspace_size_);
+    VLOG(4) << "Mallocated addr: " << tmp << ", size: " << workspace_size_;
+  } else {
+    VLOG(4) << "Don't need to Malloc workspace space.";
+  }
+  workspace_.push_back(tmp);
+  eva_->setMluWorkspaceSize(workspace_size_);
+}
+
+// Releases the workspace allocated by workspaceMalloc(), if any.
+void SyncBatchnormStatsExecutor::workspaceFree() {
+  // workspace_ has no entry if workspaceMalloc() never pushed one, so check
+  // before indexing.
+  if (!workspace_.empty() && workspace_[0]) {
+    VLOG(4) << "Free device workspace space.";
+    mlu_runtime_.deallocate(workspace_[0]);
+  }
+}
+
+// Launches the device operator under test, timing the host-side call.
+void SyncBatchnormStatsExecutor::compute() {
+  float eps = parser_->getProtoNode()->sync_batchnorm_stats_param().eps();
+
+  mluOpTensorDescriptor_t x_desc = tensor_desc_[0].tensor;
+  mluOpTensorDescriptor_t mean_desc = tensor_desc_[1].tensor;
+  mluOpTensorDescriptor_t invstd_desc = tensor_desc_[2].tensor;
+
+  VLOG(4) << "call mluOpSyncBatchNormStats()";
+  interface_timer_.start();
+#if 1  // set to 0 to exercise the legacy mluOpSyncBatchNormStats() API
+  VLOG(4) << "launch mluOpSyncBatchNormStats_v2.";
+  MLUOP_CHECK(mluOpSyncBatchNormStats_v2(
+      handle_, x_desc, data_vector_[0].device_ptr, workspace_[0],
+      workspace_size_, eps, mean_desc, data_vector_[1].device_ptr, invstd_desc,
+      data_vector_[2].device_ptr));
+#else
+  VLOG(4) << "launch mluOpSyncBatchNormStats.";
+  MLUOP_CHECK(mluOpSyncBatchNormStats(
+      handle_, x_desc, data_vector_[0].device_ptr, eps, mean_desc,
+      data_vector_[1].device_ptr, invstd_desc, data_vector_[2].device_ptr));
+#endif
+  interface_timer_.stop();
+}
+
+// One step of Kahan (compensated) summation: adds input to sum, using delta
+// to carry the rounding error between calls.
+void kahan_stats(float input, float &sum, float &delta) {
+  float y = input - delta;
+  float t = sum + y;
+  delta = t - sum - y;
+  sum = t;
+}
+
+// CPU reference: per-channel mean and 1 / sqrt(biased variance + eps) of x.
+// x holds len_x floats with the channel index varying fastest, i.e. element
+// (i, c) is x[i * len_c + c]. mean and invstd each receive len_c values;
+// either may be nullptr, in which case that output is skipped.
+void cpuSyncBatchNormStats(const float *x, const float eps, float *mean,
+                           float *invstd, const int len_x, const int len_c) {
+  if (x == nullptr || len_c <= 0 || (mean == nullptr && invstd == nullptr)) {
+    return;
+  }
+  // Keep the per-channel element count integral: it bounds the loop below,
+  // and a float copy stops being exact above 2^24.
+  const int len_nhw = len_x / len_c;
+
+  for (int ci = 0; ci < len_c; ++ci) {
+    float sum = 0, ssum = 0;
+    float c_sum = 0.0, c_ssum = 0.0;
+    const float *xc = x + ci;
+    for (int xi = 0; xi < len_nhw; ++xi) {
+      const float v = xc[xi * len_c];
+      kahan_stats(v, sum, c_sum);
+      kahan_stats(v * v, ssum, c_ssum);
+    }
+    const float mean_c = sum / len_nhw;
+    if (mean != nullptr) {
+      mean[ci] = mean_c;
+    }
+    if (invstd != nullptr) {
+      invstd[ci] = 1.0f / std::sqrt(ssum / len_nhw - mean_c * mean_c + eps);
+    }
+  }
+}
+
+// Computes the CPU baseline into cpu_fp32_output_[0] (mean) and
+// cpu_fp32_output_[1] (invstd). The last dimension of the input is taken as
+// the channel dimension; empty inputs are skipped.
+void SyncBatchnormStatsExecutor::cpuCompute() {
+  float eps = parser_->getProtoNode()->sync_batchnorm_stats_param().eps();
+
+  int idx_c = tensor_desc_[0].tensor->dim - 1;
+  int len_c = tensor_desc_[0].tensor->dims[idx_c];
+  int len_x = 1;
+  for (int i = 0; i < tensor_desc_[0].tensor->dim; ++i) {
+    len_x *= tensor_desc_[0].tensor->dims[i];
+  }
+  if (len_x == 0 || len_c == 0) {
+    return;
+  }
+  VLOG(4) << "Start to run cpuSyncBatchNormStats().";
+  cpuSyncBatchNormStats(cpu_fp32_input_[0], eps, cpu_fp32_output_[0],
+                        cpu_fp32_output_[1], len_x, len_c);
+}
+
+// Estimates the op count as cp_count operations per element of output 0.
+int64_t SyncBatchnormStatsExecutor::getTheoryOps() {
+  int cp_count = 8;
+  int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count;
+  VLOG(4) << "getTheoryOps: " << theory_ops << " ops";
+  return theory_ops;
+}
+
+}  // namespace mluoptest
diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h
new file mode 100644
index 0000000000..beeb497d7b
--- /dev/null
+++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+
+#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_BATCHNORM_STATS_H_
+#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_BATCHNORM_STATS_H_
+
+#include "executor.h"
+
+namespace mluoptest {
+
+class SyncBatchnormStatsExecutor : public Executor {
+ public:
+  SyncBatchnormStatsExecutor() {}
+  ~SyncBatchnormStatsExecutor() {}
+
+  void paramCheck();
+  void workspaceMalloc();
+  void workspaceFree();
+  void compute();
+  void cpuCompute();
+  int64_t getTheoryOps() override;
+
+ private:
+  size_t workspace_size_ = 0;
+};
+
+}  // namespace mluoptest
+
+#endif  // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_\
+BATCHNORM_STATS_H_
diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt
new file mode 100644
index 0000000000..1b346a3acf
--- /dev/null
+++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt
@@ -0,0 +1,45 @@
+op_name: "sync_batchnorm_stats"
+op_type: "SYNC_BATCHNORM_STATS"
+input {
+  id: "x"
+  shape: {
+    dims: 4
+    dims: 35
+    dims: 35
+    dims: 960
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 32
+    upper_bound: 2.83
+    lower_bound: -0.5
+    distribution: UNIFORM
+  }
+}
+output {
+  id: "mean"
+  shape: {
+    dims: 960
+  }
+  layout: LAYOUT_ARRAY
+  dtype: DTYPE_FLOAT
+}
+output {
+  id: "invstd"
+  shape: {
+    dims: 960
+  }
+  layout: LAYOUT_ARRAY
+  dtype: DTYPE_FLOAT
+}
+sync_batchnorm_stats_param: {
+  eps: 0.00001
+}
+test_param: {
+  error_func: DIFF1
+  error_func: DIFF2
+  error_threshold: 0.003
+  error_threshold: 0.003
+  baseline_device: CPU
+}
diff --git a/docs/bangc-docs/user_guide/9_operators/index.rst b/docs/bangc-docs/user_guide/9_operators/index.rst
index 0788171c9c..8833846530 100644
--- a/docs/bangc-docs/user_guide/9_operators/index.rst
+++ 
b/docs/bangc-docs/user_guide/9_operators/index.rst @@ -920,3 +920,33 @@ mluOpDynamicPointToVoxelForward 1)将体素坐标 `coors` 进行排序、去重,得到新的体素坐标 `voxel_coors`; 保存去重后体素的个数 ``num_voxels`` 到 `voxel_num`; 保存 `coors` 中每个体素坐标在 `voxel_coors` 中对应的索引到 `point2voxel_map`; 保存 `voxel_coors` 中每个体素坐标在 `coors` 中出现的个数到 `voxel_points_count`; 2)遍历 `feats` 中每个点,在特征维度上,对每个值根据 `reduce_type` 的方法进行计算,将结果保存到 `voxel_feats` 中; 当 `reduce_type` = ``max``, 在特征维度上对每个值取最大的值; 当 `reduce_type` = ``mean``, 将特征维度每个值都累加到 `voxel_feats` 对应位置中,再利用 `voxel_points_count` 获取该体素位置在原始体素中出现的个数,再对 `voxel_feats` 的特征维度求平均。 + +.. _sync_batchnorm_stats: + +mluOpSyncBatchNormStats +--------------------------------- +该算子用来计算单卡上SyncBatchNorm的均值和标准差的倒数。 + +.. _sync_batchnorm_gather_stats_with_counts: + +mluOpSyncBatchNormGatherStatsWithCounts +--------------------------------------- +该算子用来计算SyncBatchNorm的全局均值和标准差的倒数。 + +.. _sync_batchnorm_elemt: + +mluOpSyncBatchNormElemt +--------------------------------- +该算子用来计算SyncBatchNorm的前向输出。 + +.. _sync_batchnorm_backward_reduce: + +mluOpSyncBatchnormBackwardReduce +--------------------------------- +该算子用来计算损失函数相对于weight和bias的梯度,以及根据开关情况决定是否输出下级element函数的中间变量 `sum_dy` 和 `sum_dy_xmu` 。本算子通过多卡通信的方式,解决sync_batchnorm_backward在单卡上batch size数据过大导致训练时间较长的问题。 + +.. _sync_batch_norm_backward_elemt: + +mluOpSyncBatchNormBackwardElemt +--------------------------------- +该算子用来计算输入的梯度,与mluOpSyncBatchnormBackwardReduce共同实现了sync_batchnorm_backward。