diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index d1aeed4f51a16..e925f75090a46 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -5239,34 +5239,6 @@ PERFORMANCE OF THIS SOFTWARE. _____ -microsoft/vcpkg, https://github.com/microsoft/vcpkg - -Copyright (c) Microsoft Corporation - -All rights reserved. - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -_____ - openssl/openssl, https://github.com/openssl/openssl Apache License diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 567fe2255df46..378647f273ab9 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -282,7 +282,7 @@ "component": { "type": "git", "git": { - "commitHash": "28cf67e5b64c704cad993c71f29a24e781bee544", + "commitHash": "f412df7a2b64421e1f1d61fde6055a6ea288e8f5", "repositoryUrl": "https://github.com/microsoft/mimalloc.git" }, "comments": "mimalloc" @@ -408,16 +408,6 @@ "comments": "cutlass" } }, - { - "component": { - "type": "git", - "git": { - "commitHash": "6f7ffeb18f99796233b958aaaf14ec7bd4fb64b2", - "repositoryUrl": "https://github.com/microsoft/vcpkg.git" - }, - "comments": "vcpkg" - } - }, { "component": { "type": "git", diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b315b346f7b05..a5d28fb516e8e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -622,6 +622,7 @@ if (onnxruntime_USE_CUDA) list(APPEND ORT_PROVIDER_FLAGS -DUSE_FLASH_ATTENTION=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_FLASH_ATTENTION=1) endif() + endif() if (onnxruntime_USE_VITISAI) list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1) diff --git a/cmake/deps.txt b/cmake/deps.txt index 3a1a691985ea1..d16245ba833cb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -21,7 +21,7 @@ googlexnnpack;https://github.com/google/XNNPACK/archive/003c580e696a774afdc98499 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 microsoft_wil;https://github.com/microsoft/wil/archive/5f4caba4e7a9017816e47becdd918fcc872039ba.zip;fd119887d0d17c37adf1fc227b054befa28158ad -mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.0.9.zip;9d4205c93805b5525de57c6c7ed7f60e770ffdac 
+mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.0.3.zip;e4f37b93b2da78a5816c2495603a4188d316214b mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.79.0.zip;c8f04e378535ededbe5af52c8f969d2dedbe73d5 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.13.0.zip;8dda5079cdb5a134b08b0c73f4592a6404fc2dc6 #use the commit where it's several commits after 8.5-GA branch (https://github.com/onnx/onnx-tensorrt/commit/369d6676423c2a6dbf4a5665c4b5010240d99d3c) @@ -36,7 +36,6 @@ safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef217240 tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip;be70c559f07251ba7f33c789dba98872b444c10f # below are deps introduced by triton client, might remove after 1.14 release -vcpkg;https://github.com/microsoft/vcpkg/archive/refs/tags/2022.11.14.zip;3f983141351af5db2d6c3ca965959845f27d5d51 openssl;https://github.com/openssl/openssl/archive/refs/tags/openssl-3.0.7.zip;dda8fc81308555410505eb4a9eab3e1da0436a1d rapidjson;https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.zip;0fe7b4f7b83df4b3d517f4a202f3a383af7a0818 boost;https://github.com/boostorg/boost/archive/refs/tags/boost-1.81.0.zip;f6ab0da855f825b4eb1abd949967d01a4c5e4e1b diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index c07c60b486e4f..ca6d12291eec5 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -678,10 +678,11 @@ if (onnxruntime_USE_TENSORRT) target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100) target_compile_options(nvonnxparser PRIVATE /FIio.h /wd4100) endif() - include_directories(${TENSORRT_INCLUDE_DIR}) set(onnxparser_link_libs nvonnxparser_static) endif() + include_directories(${TENSORRT_INCLUDE_DIR}) + set(trt_link_libs cudnn cublas ${CMAKE_DL_LIBS} 
${TENSORRT_LIBRARY}) file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS @@ -699,11 +700,10 @@ if (onnxruntime_USE_TENSORRT) add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS}) - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) else() target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS}) - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) endif() + target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) if(onnxruntime_CUDNN_HOME) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 809a076443609..c24b6b9be548a 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -467,12 +467,21 @@ file(GLOB onnxruntime_python_quantization_cal_table_flatbuffers_src CONFIGURE_DE file(GLOB onnxruntime_python_transformers_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/*.py" ) +file(GLOB onnxruntime_python_transformers_models_bart_src CONFIGURE_DEPENDS + 
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bart/*.py" +) +file(GLOB onnxruntime_python_transformers_models_bert_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bert/*.py" +) file(GLOB onnxruntime_python_transformers_models_gpt2_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/gpt2/*.py" ) file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py" ) +file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py" +) file(GLOB onnxruntime_python_transformers_models_t5_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/t5/*.py" ) @@ -526,8 +535,11 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/tools/ort_format_model/ort_flatbuffers_py COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/bart + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/bert COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/gpt2 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/longformer + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/stable_diffusion COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/t5 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/operators @@ -606,12 +618,21 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_src} $/onnxruntime/transformers/ + COMMAND ${CMAKE_COMMAND} -E copy + 
${onnxruntime_python_transformers_models_bart_src} + $/onnxruntime/transformers/models/bart/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_bert_src} + $/onnxruntime/transformers/models/bert/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_gpt2_src} $/onnxruntime/transformers/models/gpt2/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_longformer_src} $/onnxruntime/transformers/models/longformer/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_stable_diffusion_src} + $/onnxruntime/transformers/models/stable_diffusion/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_t5_src} $/onnxruntime/transformers/models/t5/ diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ab46097856bb0..d23f0e0e2180e 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -768,9 +768,9 @@ Do not modify directly.* |||1+|**T** = tensor(double), tensor(float), tensor(float16)| |Tile|*in* input:**T**
*in* repeats:**T1**
*out* output:**T**

or

*in* input:**T**
*in* tiles:**T**
*in* axis:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(int64)| |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(int64)| -|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||10|**I** = tensor(int64)
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||[1, 9]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| +|||10|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| +|||[1, 9]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| |Transpose|*in* data:**T**
*out* transposed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index e5a84e7aa79f3..5444c825d7991 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -27,10 +27,6 @@ struct OrtRunOptions { // So it is possible that only some of the nodes are executed. bool only_execute_path_to_fetches = false; - // Set to 'true' to synchronize execution providers with CPU at the end of session run. - // Taking CUDA EP as an example, it will trigger cudaStreamSynchronize on the compute stream. - bool synchronize_execution_providers = true; - #ifdef ENABLE_TRAINING // Used by onnxruntime::training::TrainingSession. This class is now deprecated. // Delete training_mode when TrainingSession is deleted. diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index 49b46ca077b75..1f5fcd50e185c 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -25,3 +25,8 @@ // Example usage: "cpu:0;gpu:0" (or) "gpu:0" // By default, the value for this key is empty (i.e.) no memory arenas are shrunk static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memory.enable_memory_arena_shrinkage"; + +// Set to '1' to not synchronize execution providers with CPU at the end of session run. +// The default value is '0' (execution providers are synchronized). +// Taking CUDA EP as an example, it omits triggering cudaStreamSynchronize on the compute stream. 
+static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers"; diff --git a/js/web/package.json b/js/web/package.json index b18c86ae37e6d..6d0236150e625 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -8,7 +8,6 @@ "type": "git" }, "author": "fs-eire", - "module": "./lib/index.js", "version": "1.14.0", "jsdelivr": "dist/ort.min.js", "dependencies": { diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu index e86736726c224..8f271ecfcbfa8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu @@ -519,6 +519,7 @@ void InvokeAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const T* input, const T* biases, T* output, T* qkv_add_bias, const int v_head_size, int total_matrix_count) { + assert(num_heads <= max_threads_per_block); const dim3 grid(sequence_length, batch_size, num_matrices); if (qk_head_size * num_heads <= max_threads_per_block) { const dim3 block(qk_head_size, num_heads, 1); @@ -544,7 +545,7 @@ void InvokeAddBiasTranspose( AddBiasTranspose<<>>(input, biases, output); } } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); if (format == 2) { AddBiasTransposeTrtLarge<<>>(qk_head_size, input, biases, output); } else if (format == 1) { @@ -577,7 +578,7 @@ void LaunchAddBiasTranspose( const half* input, const half* biases, half* output, bool enable_half4, const int v_head_size, half* qkv_add_bias, int total_matrix_count) { total_matrix_count = std::max(num_matrices, total_matrix_count); - if (enable_half4 && 0 == (qk_head_size % 4) && 0 == (v_head_size % 4)) { + if (enable_half4 && 0 
== (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4))) { const int H = qk_head_size / 4; const int H_v = v_head_size / 4; const Half4* input2 = reinterpret_cast(input); @@ -587,7 +588,7 @@ void LaunchAddBiasTranspose( InvokeAddBiasTranspose(stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, H, input2, biases2, output2, qkv_add_bias2, H_v, total_matrix_count); - } else if (0 == (qk_head_size & 1) && 0 == (v_head_size & 1)) { + } else if (0 == (qk_head_size & 1) && (v_head_size == -1 || 0 == (v_head_size & 1))) { const int H = qk_head_size / 2; const int H_v = v_head_size / 2; const half2* input2 = reinterpret_cast(input); @@ -612,7 +613,7 @@ void LaunchAddBiasTranspose( const float* input, const float* biases, float* output, bool /*enable_half4*/, const int v_head_size, float* qkv_add_bias, int total_matrix_count) { total_matrix_count = std::max(num_matrices, total_matrix_count); - if (0 == (qk_head_size % 4) && 0 == (v_head_size % 4)) { + if (0 == (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4))) { const int H = qk_head_size / 4; const float4* input2 = reinterpret_cast(input); const float4* biases2 = reinterpret_cast(biases); @@ -622,7 +623,7 @@ void LaunchAddBiasTranspose( stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, H, input2, biases2, output2, qkv_add_bias2, v_head_size / 4, total_matrix_count); - } else if (0 == (qk_head_size & 1) && 0 == (v_head_size & 1)) { + } else if (0 == (qk_head_size & 1) && (v_head_size == -1 || 0 == (v_head_size & 1))) { const int H = qk_head_size / 2; const float2* input2 = reinterpret_cast(input); const float2* biases2 = reinterpret_cast(biases); @@ -654,7 +655,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, key, value, biases, output); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 
block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, key, value, biases, output); } } else { // cross attention @@ -666,7 +667,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, biases, output); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, biases, output); } } @@ -680,7 +681,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrtKV<<>>(key, value, biases, packed_kv); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtKVLarge<<>>(head_size, key, value, biases, packed_kv); } } @@ -737,6 +738,7 @@ void InvokeAddBias( const int batch_size, const int sequence_length, const int kv_sequence_length, const int num_heads, const int head_size, const int v_head_size, const T* biases, const T* query, const T* key, const T* value, T* q, T* k, T* v) { + assert(num_heads <= max_threads_per_block); constexpr int num_matrices = 1; // Q { @@ -745,7 +747,7 @@ void InvokeAddBias( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, biases, q); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, biases, q); } } @@ -758,7 +760,7 @@ void InvokeAddBias( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(key, biases_k, k); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, key, biases_k, k); } } @@ -772,7 +774,7 @@ void InvokeAddBias( const 
dim3 block(v_head_size, num_heads, 1); AddBiasTransposeTrt<<>>(value, biases_v, v); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(v_head_size, value, biases_v, v); } } diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu index 5c54c03a05d1a..dcbc733f2acb2 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu @@ -291,7 +291,7 @@ void LaunchBatchTopKKernel(const T* topk_scores, int32_t num_beams, int32_t k, cudaStream_t stream) { - ORT_ENFORCE(k <= 256, "LaunchBatchTopKKernel doesn't support k >= 256"); + ORT_ENFORCE(k <= 64, "LaunchBatchTopKKernel doesn't support k >= 64"); #define BatchTopKKernelLauncher(K) \ BatchTopKKernel<<>>(topk_scores, \ @@ -311,12 +311,8 @@ void LaunchBatchTopKKernel(const T* topk_scores, BatchTopKKernelLauncher(16); } else if (k <= 32) { BatchTopKKernelLauncher(32); - } else if (k <= 64) { - BatchTopKKernelLauncher(64); - } else if (k <= 128) { - BatchTopKKernelLauncher(128); } else { - BatchTopKKernelLauncher(256); + BatchTopKKernelLauncher(64); } } @@ -330,36 +326,6 @@ template void LaunchBatchTopKKernel(const float* topk_scores, int32_t k, cudaStream_t stream); -template void LaunchBatchTopKKernel(const float* topk_scores, - const int64_t* topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - float* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - -template void LaunchBatchTopKKernel(const half* topk_scores, - const int32_t* topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - half* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - -template void LaunchBatchTopKKernel(const half* topk_scores, - const int64_t* 
topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - half* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - template void BeamSearchTopK( const T* input, @@ -426,21 +392,6 @@ template void BeamSearchTopK( int32_t* output_indices, cudaStream_t stream); -template void BeamSearchTopK( - const half* input, - int32_t batch_size, - int32_t num_beams, - int32_t vocab_size, - int32_t k, - half* tmp_values_1st_stage, - int32_t* tmp_indices_1st_stage, - half* tmp_values_2st_stage, - int32_t* tmp_indices_2st_stage, - half* output_values, - int32_t* output_tokens, - int32_t* output_indices, - cudaStream_t stream); - } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h index 5e338b417e8a5..096448c002e36 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h @@ -11,18 +11,6 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template -void LaunchBatchTopKKernel( - const T* topk_scores, - const I* topk_indices, - int32_t* next_indices, - int32_t* next_tokens, - T* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - template void BeamSearchTopK( const T* input, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 1a5a9ac5d97b2..79dc470c693e1 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -430,12 +430,16 @@ Status ProcessLogits(const OrtValue& logits, // dumper->Print("next_indices before scorer", beam_state->next_indices.data(), batch_size, 2 * num_beams); dumper->Print("next_scores before scorer", 
beam_state->next_scores.data(), batch_size, 2 * num_beams); #endif + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), + beam_state->next_scores.data(), + beam_state->next_scores.size_bytes(), + cudaMemcpyDeviceToHost, + cuda_stream)); } else { // Apply top-k selection like the following: // next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) // next_token_scores, next_tokens = torch.topk(next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True) - // int64_t next_token_scores_dims[] = {batch_size, num_beams * vocab_size}; - int64_t next_token_scores_dims[] = {batch_size * num_beams, vocab_size}; + int64_t next_token_scores_dims[] = {batch_size, num_beams * vocab_size}; TensorShape next_token_scores_shape(&next_token_scores_dims[0], 2); auto element_type = DataTypeImpl::GetType(); @@ -450,31 +454,36 @@ Status ProcessLogits(const OrtValue& logits, // constexpr bool sorted = true; // results returned in sorted order. std::unique_ptr topk_scores = Tensor::CreateDefault(); - std::unique_ptr topk_tokens = Tensor::CreateDefault(); + std::unique_ptr topk_indices = Tensor::CreateDefault(); ORT_RETURN_IF_ERROR(TopK(&input, axis, top_k, largest, sorted, allocator, ort_stream, thread_pool, - *topk_scores, *topk_tokens)); + *topk_scores, *topk_indices)); #ifdef DEBUG_GENERATION dumper->Print("topk_scores", *(topk_scores.get())); - dumper->Print("topk_tokens", *(topk_tokens.get())); + dumper->Print("topk_indices", *(topk_indices.get())); +#endif + + // Convert indices in range [0, num_beams * vocab_size) to token ID of range [0, vocab_size) like the following: + // next_indices = (next_tokens / vocab_size).long() + // next_tokens = next_tokens % vocab_size + const int64_t* next_token_indices = topk_indices->Data(); + cuda::LaunchNextTokenKernel(next_token_indices, beam_state->next_indices.data(), beam_state->next_tokens.data(), + batch_size, top_k, vocab_size, cuda_stream); + + const float* data = topk_scores->Data(); 
+#ifdef DEBUG_GENERATION + dumper->Print("next_scores before scorer", data, batch_size, top_k); + dumper->Print("next_tokens before scorer", beam_state->next_tokens.data(), batch_size, top_k); + dumper->Print("next_indices before scorer", beam_state->next_indices.data(), batch_size, top_k); #endif - cuda::LaunchBatchTopKKernel(topk_scores->Data(), - topk_tokens->Data(), - beam_state->next_indices.data(), - beam_state->next_tokens.data(), - beam_state->next_scores.data(), - batch_size, - num_beams, - 2 * num_beams, - cuda_stream); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), + data, + topk_scores->SizeInBytes(), + cudaMemcpyDeviceToHost, + cuda_stream)); } - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), - beam_state->next_scores.data(), - beam_state->next_scores.size_bytes(), - cudaMemcpyDeviceToHost, - cuda_stream)); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_tokens.data(), beam_state->next_tokens.data(), beam_state->next_tokens.size_bytes(), diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 79691d7b516dc..f88d098454479 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -20,6 +20,8 @@ #include "core/framework/tensorprotoutils.h" #include "core/mlas/inc/mlas.h" #include "core/framework/TensorSeq.h" +#include "core/framework/run_options.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #ifdef USE_AZURE #include "core/framework/cloud_executor.h" #endif @@ -793,13 +795,14 @@ common::Status ExecuteGraph(const SessionState& session_state, logger); } #endif + bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0"; return ExecuteGraph(session_state, feeds_fetches_manager, feeds, fetches, execution_mode, run_options.terminate, logger, - run_options.synchronize_execution_providers, + synchronize_execution_providers, 
run_options.only_execute_path_to_fetches); } diff --git a/onnxruntime/core/optimizer/bias_softmax_fusion.cc b/onnxruntime/core/optimizer/bias_softmax_fusion.cc index 80603cdbd3270..7c34449d583cc 100755 --- a/onnxruntime/core/optimizer/bias_softmax_fusion.cc +++ b/onnxruntime/core/optimizer/bias_softmax_fusion.cc @@ -135,6 +135,7 @@ bool TrySelectInputAndBiasWithAlignment(Node& add_node, Node& softmax_node, Node new_axis = (int)HandleNegativeAxis(axis, rank); // The axis attribute for Softmax in OpSet-11 and OpSet-13 are different. + // Details in function documentation. if (is_since_opset_13 && new_axis != rank - 1) return false; int singlebatch_rank = rank - new_axis; diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index fdee3c19f2e8e..53545c66508c5 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -200,7 +200,6 @@ InlinedVector> GenerateTransformers( // CSE. For example, if A and B nodes both do Add operation with a same value but different initializers, by // default, CSE will not merge them, because the different initializers are represented by different NodeArg. 
if (session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableDoubleQDQRemover, "0") == "0"){ - transformers.emplace_back(std::make_unique()); transformers.emplace_back(std::make_unique()); } transformers.emplace_back(std::make_unique()); diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index 9895918dd2653..25feb5b8d702c 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -4,6 +4,7 @@ #include "core/optimizer/layer_norm_fusion.h" #include "core/graph/graph_utils.h" #include "core/optimizer/utils.h" +#include "core/optimizer/transpose_optimizer/optimizer_api.h" #include "float.h" #include @@ -16,12 +17,17 @@ static constexpr std::array supported_data_types{"tensor(fl // Default epsilon static constexpr float DEFAULT_LAYERNORM_EPSILON = 1e-5f; -static bool IsSupportedDataType(const Node& node) { +static bool IsSupportedDataType(const Node& node, int first_n_inputs=-1) { + int input_index = 0; for (const auto& input_arg : node.InputDefs()) { + if (first_n_inputs != -1 && input_index >= first_n_inputs) { + return true; + } if (std::find(supported_data_types.begin(), supported_data_types.end(), *(input_arg->Type())) == supported_data_types.end()) { return false; } + ++input_index; } return true; } @@ -99,11 +105,11 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, Node& reduce_mean_node = *p_reduce_mean; ORT_RETURN_IF_ERROR(Recurse(reduce_mean_node, modified, graph_level, logger)); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13, 18}) || !graph_utils::IsSupportedProvider(reduce_mean_node, GetCompatibleExecutionProviders()) || (reduce_mean_node.GetOutputEdgesCount() != 1 && reduce_mean_node.GetOutputEdgesCount() != 2) || 
graph.NodeProducesGraphOutput(reduce_mean_node) || - !IsSupportedDataType(reduce_mean_node)) { + !IsSupportedDataType(reduce_mean_node, 1)) { continue; } nodes_to_remove.push_back(reduce_mean_node); @@ -263,10 +269,10 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, continue; } Node& reduce_mean2_node = *graph.GetNode(p_reduce_mean2->Index()); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean2_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean2_node, "ReduceMean", {1, 11, 13, 18}) || reduce_mean2_node.GetExecutionProviderType() != reduce_mean_node.GetExecutionProviderType() || !optimizer_utils::CheckOutputEdges(graph, reduce_mean2_node, 1) || - !IsSupportedDataType(reduce_mean2_node) || + !IsSupportedDataType(reduce_mean2_node, 1) || reduce_mean2_node.GetInputEdgesCount() == 0) { continue; } @@ -333,8 +339,16 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, // get axes attributes const onnxruntime::NodeAttributes& attributes = reduce_mean_node.GetAttributes(); std::vector axes_values; + // TODO: modify this code when opset >= 18 (axes is an input). if (attributes.find("axes") != attributes.end()) { axes_values = RetrieveValues(attributes.at("axes")); + } else if (reduce_mean_node.InputDefs().size() == 2) { + auto axes = reduce_mean_node.InputDefs()[1]; + auto axes_const = graph.GetConstantInitializer(axes->Name(), true); + if (axes_const != nullptr) { + Initializer initializer{*axes_const, graph.ModelPath()}; + axes_values.insert(axes_values.end(), initializer.DataAsSpan().begin(), initializer.DataAsSpan().end()); + } } // Get the inputs for the new LayerNormalization node. 
@@ -485,9 +499,9 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr continue; } Node& reduce_mean_node = *graph.GetNode(p_reduce_mean->Index()); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13, 18}) || reduce_mean_node.GetExecutionProviderType() != pow_node.GetExecutionProviderType() || - !optimizer_utils::CheckOutputEdges(graph, reduce_mean_node, 1) || !IsSupportedDataType(reduce_mean_node) || + !optimizer_utils::CheckOutputEdges(graph, reduce_mean_node, 1) || !IsSupportedDataType(reduce_mean_node, 1) || reduce_mean_node.GetInputEdgesCount() == 0) { continue; } @@ -585,6 +599,13 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr std::vector axes_values; if (attributes.find("axes") != attributes.end()) { axes_values = RetrieveValues(attributes.at("axes")); + } else if (reduce_mean_node.InputDefs().size() == 2) { + auto axes = reduce_mean_node.InputDefs()[1]; + auto axes_const = graph.GetConstantInitializer(axes->Name(), true); + if (axes_const != nullptr && axes_const->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + Initializer initializer{*axes_const, graph.ModelPath()}; + axes_values.insert(axes_values.end(), initializer.DataAsSpan().begin(), initializer.DataAsSpan().end()); + } } // Get the inputs for the new LayerNormalization node. 
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc index 0ac7cbb8fa058..700c91ab85974 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc @@ -1040,7 +1040,7 @@ static bool HandlePad(HandlerArgs& args) { constexpr HandlerInfo pad_handler = {&FirstInput, &HandlePad}; -static bool HandleReduceOp(HandlerArgs& args) { +static bool HandleReduceOpWithArg(HandlerArgs& args) { int64_t keepdims = args.node.GetAttributeIntDefault("keepdims", 1); std::optional> axes = args.node.GetAttributeInts("axes"); @@ -1078,11 +1078,11 @@ static bool HandleReduceOp(HandlerArgs& args) { return true; } -constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOp}; - -static bool HandleReduceSum(HandlerArgs& args) { - if (args.ctx.opset < 13) { - return HandleReduceOp(args); +static bool HandleReduceOps(HandlerArgs& args) { + if ((args.node.OpType() == "ReduceSum" && args.ctx.opset < 13) || + // or all other reduce operators since opset 18 + (args.node.OpType() != "ReduceSum" && args.ctx.opset < 18)) { + return HandleReduceOpWithArg(args); } bool keepdims = args.node.GetAttributeIntDefault("keepdims", 1) != 0; @@ -1147,7 +1147,7 @@ static bool HandleReduceSum(HandlerArgs& args) { return true; } -constexpr HandlerInfo reduce_sum_handler = {&FirstInput, &HandleReduceSum}; +constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps}; static bool HandleSqueeze(HandlerArgs& args) { std::vector new_axes; @@ -1709,7 +1709,7 @@ static const std::unordered_map handler_ma #if !defined(USE_CUDA) && !defined(USE_ROCM) {"Resize", resize_handler}, #endif - {"ReduceSum", reduce_sum_handler}, + {"ReduceSum", reduce_op_handler}, {"ReduceLogSum", reduce_op_handler}, {"ReduceLogSumExp", reduce_op_handler}, diff --git a/onnxruntime/core/providers/cuda/math/topk.cc 
b/onnxruntime/core/providers/cuda/math/topk.cc index 7ea165c611cb9..3b0edaa559ce9 100644 --- a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -12,7 +12,12 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( kOnnxDomain, 1, 9, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + (*KernelDefBuilder::Create()) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), TopK); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -20,7 +25,14 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( kOnnxDomain, 10, 10, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType()), + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("I", DataTypeImpl::GetTensorType()), TopK); ONNX_OPERATOR_KERNEL_EX( @@ -28,7 +40,14 @@ ONNX_OPERATOR_KERNEL_EX( kOnnxDomain, 11, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType()), + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("I", DataTypeImpl::GetTensorType()), TopK); template @@ -42,11 +61,11 @@ TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { } #define IS_PRIM_TYPE(T) 
utils::IsPrimitiveDataType(prim_type) -#define TOPKIMPL(T) TopKImpl(this, ctx->GetComputeStream(), tensor_X->Data(), \ - static_cast(tensor_V->MutableDataRaw()), \ - static_cast(tensor_I->MutableDataRaw()), \ - elem_nums_cuda, \ - elem_nums.size(), \ +#define TOPKIMPL(T) TopKImpl(this, ctx->GetComputeStream(), tensor_X->Data(), \ + static_cast(tensor_V->MutableDataRaw()), \ + static_cast(tensor_I->MutableDataRaw()), \ + elem_nums_cuda, \ + elem_nums.size(), \ axis, K_, largest_, sorted_, N, dimension) template @@ -87,12 +106,6 @@ Status TopK::ComputeInternal(OpKernelContext* ctx) const { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator"); } - if (IS_PRIM_TYPE(uint8_t)) return TOPKIMPL(uint8_t); - if (IS_PRIM_TYPE(uint16_t)) return TOPKIMPL(uint16_t); - if (IS_PRIM_TYPE(uint32_t)) return TOPKIMPL(uint32_t); - if (IS_PRIM_TYPE(uint64_t)) return TOPKIMPL(uint64_t); - if (IS_PRIM_TYPE(int8_t)) return TOPKIMPL(int8_t); - if (IS_PRIM_TYPE(int16_t)) return TOPKIMPL(int16_t); if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t); if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t); if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16); diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu b/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu deleted file mode 100644 index e194bd1bfd15a..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE int16_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu b/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu deleted file mode 100644 index db32e9e43392f..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#define TOPK_IMPL_TYPE int8_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu deleted file mode 100644 index c9ed54e832e9e..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint16_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu deleted file mode 100644 index fceb367e7eb03..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint32_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu deleted file mode 100644 index 1a7b3f2aed878..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint64_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu deleted file mode 100644 index 7fcd4b81b3bf9..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#define TOPK_IMPL_TYPE uint8_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 43ccfa29624d9..79068a0271f28 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1997,7 +1997,8 @@ Status InferenceSession::Run(const RunOptions& run_options, // info all execution providers InferenceSession:Run ended for (auto* xp : exec_providers_to_stop) { - auto status = xp->OnRunEnd(run_options.synchronize_execution_providers); + bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0"; + auto status = xp->OnRunEnd(synchronize_execution_providers); ORT_CHECK_AND_SET_RETVAL(status); } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 11f45ca8c9adb..2d62d2e3d70a8 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1343,8 +1343,6 @@ RunOptions instance. 
The individual calls will exit gracefully and return an err #endif .def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches, R"pbdoc(Only execute the nodes needed by fetch list)pbdoc") - .def_readwrite("synchronize_execution_providers", &RunOptions::synchronize_execution_providers, - R"pbdoc(Synchronize execution providers after executing session.)pbdoc") .def( "add_run_config_entry", [](RunOptions* options, const char* config_key, const char* config_value) -> void { diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 689235b630d94..3175b96d322a1 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -202,6 +202,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization, "GroupNorm": self._infer_GroupNorm, "BiasSplitGelu": self._infer_BiasSplitGelu, + "NhwcConv": self._infer_NhwcConv, } self.aten_op_dispatcher_ = { "embedding": self._infer_Gather, @@ -438,6 +439,7 @@ def _onnx_infer_single_node(self, node): "MultiHeadAttention", "GroupNorm", "BiasSplitGelu", + "NhwcConv", ] if not skip_infer: @@ -619,13 +621,13 @@ def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): def _new_symbolic_shape(self, rank, node, out_idx=0): return [self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank)] - def _compute_conv_pool_shape(self, node): + def _compute_conv_pool_shape(self, node, channels_last=False): sympy_shape = self._get_sympy_shape(node, 0) if len(node.input) > 1: W_shape = self._get_sympy_shape(node, 1) rank = len(W_shape) - 2 # number of spatial axes - kernel_shape = W_shape[-rank:] - sympy_shape[1] = W_shape[0] + kernel_shape = W_shape[-rank - 1 : -1] if channels_last else W_shape[-rank:] + sympy_shape[3 if channels_last else 1] = W_shape[0] else: W_shape = None kernel_shape = 
get_attribute(node, "kernel_shape") @@ -634,13 +636,17 @@ def _compute_conv_pool_shape(self, node): assert len(sympy_shape) == rank + 2 # only need to symbolic shape inference if input has symbolic dims in spatial axes - is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]] + spatial_shape = sympy_shape[-rank - 1 : -1] if channels_last else sympy_shape[-rank:] + is_symbolic_dims = [not is_literal(i) for i in spatial_shape] if not any(is_symbolic_dims): shape = get_shape_from_value_info(self.known_vi_[node.output[0]]) if len(shape) > 0: assert len(sympy_shape) == len(shape) - sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] + if channels_last: + sympy_shape[-rank - 1 : -1] = [sympy.Integer(d) for d in shape[-rank - 1 : -1]] + else: + sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] return sympy_shape dilations = get_attribute(node, "dilations", [1] * rank) @@ -671,7 +677,7 @@ def _compute_conv_pool_shape(self, node): ceil_mode = get_attribute(node, "ceil_mode", 0) for i in range(rank): - effective_input_size = sympy_shape[-rank + i] + effective_input_size = sympy_shape[-rank + i + (-1 if channels_last else 0)] if len(total_pads) > 0: effective_input_size = effective_input_size + total_pads[i] if ceil_mode: @@ -680,7 +686,7 @@ def _compute_conv_pool_shape(self, node): ) else: strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) // strides[i] - sympy_shape[-rank + i] = strided_kernel_positions + 1 + sympy_shape[-rank + i + (-1 if channels_last else 0)] = strided_kernel_positions + 1 return sympy_shape def _check_merged_dims(self, dims, allow_broadcast=True): @@ -914,6 +920,18 @@ def _infer_Conv(self, node): ) ) + def _infer_NhwcConv(self, node): + sympy_shape = self._compute_conv_pool_shape(node, channels_last=True) + self._update_computed_dims(sympy_shape) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + 
self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + def _infer_Einsum(self, node): # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 equation = get_attribute(node, "equation") @@ -2455,6 +2473,7 @@ def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=F all_shapes_inferred = symbolic_shape_inference._infer_impl() symbolic_shape_inference._update_output_from_vi() if not all_shapes_inferred: + onnx.save_model(symbolic_shape_inference.out_mp_, "sym_shape_infer_temp.onnx", save_as_external_data=True) raise Exception("Incomplete symbolic shape inference") return symbolic_shape_inference.out_mp_ diff --git a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py new file mode 100644 index 0000000000000..d8ecb652800f6 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py @@ -0,0 +1,90 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List + +from fusion_base import Fusion +from onnx import TensorProto, helper, numpy_helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionNhwcConv(Fusion): + """Convert Conv to NhwcConv""" + + def __init__(self, model: OnnxModel, update_weight=False): + super().__init__(model, "NhwcConv", ["Conv"], "NhwcConv") + self.update_weight = update_weight + + def create_transpose_node(self, input_name: str, perm: List[int], output_name=None): + """Append a Transpose node after an input""" + node_name = self.model.create_node_name("Transpose") + + if output_name is None: + output_name = node_name + "_out" + "-" + input_name + + transpose_node = helper.make_node("Transpose", inputs=[input_name], outputs=[output_name], name=node_name) + transpose_node.attribute.extend([helper.make_attribute("perm", perm)]) + + return transpose_node + + def fuse(self, conv, input_name_to_nodes, output_name_to_node): + # Add Transpose node to convert input from NCHW to NHWC + input_transpose_node = self.create_transpose_node(conv.input[0], [0, 2, 3, 1]) + + nhwc_conv_input = input_transpose_node.output[0] + + # Create a tensor for transposed weights (already in NHWC format). 
+ node_name = self.model.create_node_name("NhwcConv") + + # Make sure the weights is 4D + weight_tensor = self.model.get_initializer(conv.input[1]) + if weight_tensor is None: + return + weight = numpy_helper.to_array(weight_tensor) + if len(weight.shape) != 4: + return + + if self.update_weight: + # Transpose weights from NCHW to NHWC + weight = weight.transpose(0, 2, 3, 1) + + weight_name = node_name + "_weight_NHWC" + nhwc_weight = helper.make_tensor( + name=weight_name, + data_type=TensorProto.FLOAT, + dims=list(weight.shape), + vals=weight.flatten().tolist(), + ) + self.model.add_initializer(nhwc_weight, self.this_graph_name) + weight_transpose_node = None + else: + weight_transpose_node = self.create_transpose_node(conv.input[1], [0, 2, 3, 1]) + weight_name = weight_transpose_node.output[0] + + nhwc_output_name = node_name + "_out" + "-" + conv.output[0] + nhwc_conv = helper.make_node( + "NhwcConv", + inputs=[nhwc_conv_input, weight_name] + conv.input[2:], + outputs=[nhwc_output_name], + name=node_name + "-" + conv.name, + ) + nhwc_conv.attribute.extend(conv.attribute) + nhwc_conv.domain = "com.microsoft" + + output_transpose_node = self.create_transpose_node(nhwc_conv.output[0], [0, 3, 1, 2], conv.output[0]) + + self.nodes_to_remove.append(conv) + + nodes_to_add = [input_transpose_node, nhwc_conv, output_transpose_node] + if weight_transpose_node: + nodes_to_add.append(weight_transpose_node) + for node in nodes_to_add: + self.node_name_to_graph_name[node.name] = self.this_graph_name + self.nodes_to_add.extend(nodes_to_add) + + self.increase_counter("NhwcConv") diff --git a/onnxruntime/python/tools/transformers/fusion_reshape.py b/onnxruntime/python/tools/transformers/fusion_reshape.py index 75caa255b1c24..853038f7460d7 100644 --- a/onnxruntime/python/tools/transformers/fusion_reshape.py +++ b/onnxruntime/python/tools/transformers/fusion_reshape.py @@ -119,16 +119,15 @@ def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): 
shape_nodes.extend([path2[-1], path3[-1]]) shape.append(-1) elif len(concat_node.input) > 2: - concat_2 = self.model.get_initializer(concat_node.input[2]) - if concat_2 is None: + concat_value = self.model.get_constant_value(concat_node.input[2]) + if concat_value is None: return - concat_value = numpy_helper.to_array(concat_2) if isinstance(concat_value, np.ndarray): shape.extend(concat_value.tolist()) else: shape.append(concat_value) - if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None: + if len(concat_node.input) == 4 and self.model.get_constant_value(concat_node.input[3]) is None: if -1 in shape: return diff --git a/onnxruntime/python/tools/transformers/fusion_transpose.py b/onnxruntime/python/tools/transformers/fusion_transpose.py new file mode 100644 index 0000000000000..d92ddd5f8e678 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_transpose.py @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict, List + +from fusion_base import Fusion +from fusion_utils import FusionUtils +from onnx import NodeProto, helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionTranspose(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Transpose", "Transpose") + + def fuse( + self, + transpose_node: NodeProto, + input_name_to_nodes: Dict[str, List[NodeProto]], + output_name_to_node: Dict[str, NodeProto], + ): + """ + Case 1: + (input)-->Transpose(perm=a)-->Transpose(perm=b)--> + After: + (input)-->Transpose(perm=a)--> (this path can be removed if the output is not used anymore) + | + +----->Transpose(perm=a*b)--> + + Case 2 (Cast has only one child): + (input)-->Transpose(perm=a)--> Cast -->Transpose(perm=b)--> + After: + (input)-->Transpose(perm=a)--> (this path can be removed if the output is not used anymore) + | + +----->Cast --> Transpose(perm=a*b)--> + + + """ + transpose_b = transpose_node + if transpose_b.input[0] not in output_name_to_node: + return + + transpose_a = output_name_to_node[transpose_b.input[0]] + if transpose_a.op_type != "Cast": + cast_node = None + else: + cast_node = transpose_a + + cast_children = self.model.get_children(cast_node, input_name_to_nodes) + if cast_children and len(cast_children) > 1: + return + transpose_a = output_name_to_node[cast_node.input[0]] + + if transpose_a.op_type != "Transpose": + return + + permutation = OnnxModel.get_node_attribute(transpose_b, "perm") + assert isinstance(permutation, list) + + parent_permutation = OnnxModel.get_node_attribute(transpose_a, "perm") + assert isinstance(parent_permutation, list) + + assert len(parent_permutation) == len(permutation) + + output_permutation = [] + for j, index in enumerate(permutation): + output_permutation.append(parent_permutation[index]) + + if cast_node is None: + if 
FusionUtils.skip_parent(self.model, transpose_b, transpose_a, input_name_to_nodes): + self.nodes_to_remove.append(transpose_a) + else: + if FusionUtils.skip_parent(self.model, cast_node, transpose_a, input_name_to_nodes): + self.nodes_to_remove.append(transpose_a) + transpose_b.ClearField("attribute") + transpose_b.attribute.extend([helper.make_attribute("perm", output_permutation)]) diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 8363f2674cd40..07fdf490337a4 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -73,6 +73,32 @@ def remove_cast_int32(self, input_name: str): self.model.remove_node(node) self.model.replace_input_of_all_nodes(output_name, input_name) + @staticmethod + def skip_parent(model: OnnxModel, node, parent_node, input_name_to_nodes): + """ + Before: + (input)-->parent-->node-->(output) + After: + (input)-->parent--> + | + +----->node-->(output) + + This function returns a flag about whether the parent node can be removed. + Note that this function assumes the node has first input links from parent! + """ + parent_can_be_removed = False + input_name_to_nodes[node.input[0]].remove(node) + # We can remove the first Transpose if its output is not used (linked to graph output or other nodes) anymore. + if len(input_name_to_nodes[node.input[0]]) == 0 and not model.find_graph_output( + node.input[0] + ): # checks main graph output. TODO: deal with subgraph + parent_can_be_removed = True + # self.nodes_to_remove.append(transpose_a) + + input_name_to_nodes[parent_node.input[0]].append(node) + node.input[0] = parent_node.input[0] + return parent_can_be_removed + @staticmethod def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): """Verify that a node has expected value for an attribute. 
@@ -228,7 +254,10 @@ def remove_useless_reshape_nodes(self): graph_output_names = set(self.model.get_graphs_output_names()) for node in nodes_to_remove: if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): + if ( + not bool(set(node.input) & graph_input_names) + and len(self.model.input_name_to_nodes()[node.input[0]]) == 1 # parent has only one child + ): self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) else: continue diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index 580c5ef4c3cca..9a00dc8684f32 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -62,7 +62,7 @@ def get_ort_pipeline(model_name: str, directory: str, provider: str, disable_saf return pipe -def get_torch_pipeline(model_name: str, disable_channels_last: bool, disable_safety_checker: bool): +def get_torch_pipeline(model_name: str, disable_safety_checker: bool): from diffusers import StableDiffusionPipeline from torch import channels_last, float16 @@ -70,8 +70,7 @@ def get_torch_pipeline(model_name: str, disable_channels_last: bool, disable_saf model_name, torch_dtype=float16, revision="fp16", use_auth_token=True ).to("cuda") - if not disable_channels_last: - pipe.unet.to(memory_format=channels_last) # in-place operation + pipe.unet.to(memory_format=channels_last) # in-place operation if disable_safety_checker: pipe.safety_checker = None @@ -144,7 +143,7 @@ def run_ort(model_name: str, directory: str, provider: str, batch_size: int, dis run_ort_pipeline(pipe, batch_size, image_filename_prefix) -def run_torch(model_name: str, batch_size: int, disable_channels_last: bool, disable_safety_checker: bool): +def run_torch(model_name: str, batch_size: int, disable_safety_checker: bool): import torch 
torch.backends.cudnn.enabled = True @@ -154,13 +153,11 @@ def run_torch(model_name: str, batch_size: int, disable_channels_last: bool, dis torch.set_grad_enabled(False) load_start = time.time() - pipe = get_torch_pipeline(model_name, disable_channels_last, disable_safety_checker) + pipe = get_torch_pipeline(model_name, disable_safety_checker) load_end = time.time() print(f"Model loading took {load_end - load_start} seconds") - image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) + ( - "" if disable_channels_last else "_channels_last" - ) + image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) with torch.inference_mode(): run_torch_pipeline(pipe, batch_size, image_filename_prefix) @@ -196,15 +193,6 @@ def parse_arguments(): help="Directory of saved onnx pipeline. It could be output directory of optimize_pipeline.py.", ) - parser.add_argument( - "-c", - "--disable_channels_last", - required=False, - action="store_true", - help="Disable channels last for torch. 
It will be ignored for onnxruntime engine", - ) - parser.set_defaults(disable_channels_last=False) - parser.add_argument( "--enable_safety_checker", required=False, @@ -237,7 +225,7 @@ def main(): provider = "CUDAExecutionProvider" # TODO: use ["CUDAExecutionProvider", "CPUExecutionProvider"] in diffuers run_ort(sd_model, args.pipeline, provider, args.batch_size, not args.enable_safety_checker) else: - run_torch(sd_model, args.batch_size, args.disable_channels_last, not args.enable_safety_checker) + run_torch(sd_model, args.batch_size, not args.enable_safety_checker) if __name__ == "__main__": diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 0979f0d2ddcb5..932be4a19ae6b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -11,18 +11,15 @@ # huggingface-cli login # wget https://raw.githubusercontent.com/huggingface/diffusers/v0.12.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py # python convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path $ONNX_ROOT/stable-diffusion-v1-5-fp32 -# python convert_stable_diffusion_checkpoint_to_onnx.py --model_path stabilityai/stable-diffusion-2-1 --output_path $ONNX_ROOT/stable-diffusion-v2-1-fp32 -# Note that this script might not be compatible with older or newer version of diffusers/transformers. It is because fusion script need change accordingly when onnx graph is changed. +# Note that this script might not be compatible with older or newer version of diffusers. 
# Then you can use this script to convert them to float16 like the following: # python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 -# python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 # Or -# pip install -U onnxruntime-gpu >= 1.14 # python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 -# python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 - -# Note that float16 model is for CUDA Execution Provider. It might not run in CPU Execution Provider. +# +# Note that output model is for CUDA Execution Provider. It might not run in CPU Execution Provider. +# Stable diffusion 2.1 model will get black images using float16 Attention. It is a known issue that we are working on. import argparse import logging @@ -40,7 +37,7 @@ logger = logging.getLogger(__name__) -def optimize_stable_diffusion_onnx_pipeline( +def optimize_sd_pipeline( source_dir: Path, target_dir: Path, overwrite: bool, use_external_data_format: bool, float16: bool ): """Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16. @@ -66,23 +63,18 @@ def optimize_stable_diffusion_onnx_pipeline( raise RuntimeError(message) continue - num_heads = 0 - hidden_size = 0 - # Graph fusion before fp16 conversion, otherwise they cannot be fused later. # Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead. logger.info(f"optimize {onnx_model_path}...") fusion_options = FusionOptions("unet") - # packed kv requires compute capacity >= 7.5 (like T4, A100, RTX 2060~4090. 
See https://developer.nvidia.com/cuda-gpus) - # Suggest to disable it if you are using older GPU like V100, RTX 1060/1070/1080, or using float32 model. fusion_options.enable_packed_kv = float16 m = optimize_model( str(onnx_model_path), model_type="unet", - num_heads=num_heads, - hidden_size=hidden_size, + num_heads=0, # will be deduced from graph + hidden_size=0, # will be deduced from graph opt_level=0, optimization_options=fusion_options, use_gpu=False, @@ -211,7 +203,7 @@ def main(): coloredlogs.install(fmt="%(funcName)20s: %(message)s") args = parse_arguments() copy_extra_directory(Path(args.input), Path(args.output), args.overwrite) - optimize_stable_diffusion_onnx_pipeline( + optimize_sd_pipeline( Path(args.input), Path(args.output), args.overwrite, args.use_external_data_format, args.float16 ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt new file mode 100644 index 0000000000000..45190f2fb9912 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt @@ -0,0 +1,15 @@ +# Install the following package in python 3.10 +diffusers==0.12.1 +transformers==4.26.0 +numpy==1.24.1 +accelerate==0.15.0 +onnxruntime-gpu>=1.14 +onnx==1.13.0 +coloredlogs +packaging==23.0 +protobuf==3.20.3 +psutil==5.9.4 +sympy==1.11.1 +#Tested with PyTorch 1.13.1+cu117 (see pytorch.org for more download options). 
+#--extra-index-url https://download.pytorch.org/whl/cu117 +#torch==1.13.1+cu117 diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 96c22b5894c60..42fd4d5909a30 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -128,6 +128,8 @@ def remove_node(self, node): for graph in self.graphs(): if node in graph.node: graph.node.remove(node) + return + logger.warning("Failed to remove node %s", node) # It might be a bug to hit this line. def remove_nodes(self, nodes_to_remove): for node in nodes_to_remove: @@ -182,6 +184,12 @@ def replace_node_output(node, old_output_name, new_output_name): node.output[j] = new_output_name def replace_output_of_all_nodes(self, old_output_name, new_output_name): + # This function shall be used carefully. For example: + # Add --[old_name]--> Cast ---> [new_name] + # | + # +----[old_name]--> Transpose --> + # If we want to remove the Cast node: replace output of Add to new_name is not enough; + # The input of Transpose shall also be updated to new_name. 
for node in self.model.graph.node: OnnxModel.replace_node_output(node, old_output_name, new_output_name) @@ -553,7 +561,9 @@ def get_data_type(input_or_output_name): graph_output_names = set(self.get_graphs_output_names()) for node in nodes_to_remove: if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): + if (not bool(set(node.input) & graph_input_names)) and len( + self.input_name_to_nodes()[node.input[0]] + ) == 1: self.replace_output_of_all_nodes(node.input[0], node.output[0]) else: continue diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index feba717bd8f6f..32a98149825c3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -9,8 +9,11 @@ from fusion_attention_unet import FusionAttentionUnet from fusion_biassplitgelu import FusionBiasSplitGelu from fusion_group_norm import FusionGroupNorm +from fusion_nhwc_conv import FusionNhwcConv from fusion_options import FusionOptions +from fusion_transpose import FusionTranspose from onnx import ModelProto +from onnx_model import OnnxModel from onnx_model_bert import BertOnnxModel logger = getLogger(__name__) @@ -30,10 +33,61 @@ def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): super().__init__(model, num_heads=num_heads, hidden_size=hidden_size) def preprocess(self): - return + self.remove_useless_div() def postprocess(self): + self.merge_sequential_transpose() self.prune_graph() + self.remove_unused_constant() + + def remove_useless_div(self): + """Remove Div by 1""" + div_nodes = [node for node in self.nodes() if node.op_type == "Div"] + + nodes_to_remove = [] + for div in div_nodes: + if self.find_constant_input(div, 1.0) == 1: + nodes_to_remove.append(div) + + for node in nodes_to_remove: + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + + if nodes_to_remove: + 
self.remove_nodes(nodes_to_remove) + logger.info("Removed %d useless Div (by 1) nodes", len(nodes_to_remove)) + + def convert_conv_to_nhwc(self): + # Do not update weight here since save external data has a bug + conv_to_nhwc_conv = FusionNhwcConv(self, update_weight=False) + conv_to_nhwc_conv.apply() + + def merge_sequential_transpose(self): + fusion_transpose = FusionTranspose(self) + fusion_transpose.apply() + + remove_count = 0 + nodes = self.get_nodes_by_op_type("Transpose") + for node in nodes: + permutation = OnnxModel.get_node_attribute(node, "perm") + assert isinstance(permutation, list) + if permutation != list(range(len(permutation))): + continue + assert not ( + self.find_graph_output(node.output[0]) + or self.find_graph_input(node.input[0]) + or self.find_graph_output(node.input[0]) + ) + + # Let all children nodes skip current Transpose node and link to its parent + # Note that we cannot update parent node output since parent node might have more than one children. + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + + self.remove_node(node) + remove_count += 1 + + total = len(fusion_transpose.nodes_to_remove) + remove_count + if total: + logger.info("Removed %d Transpose nodes", total) def optimize(self, options: Optional[FusionOptions] = None): if (options is not None) and not options.enable_shape_inference: @@ -78,7 +132,7 @@ def optimize(self, options: Optional[FusionOptions] = None): # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. self.utils.remove_useless_reshape_nodes() - self.postprocess() + self.convert_conv_to_nhwc() if (options is None) or options.enable_bias_skip_layer_norm: # Fuse SkipLayerNormalization and Add Bias before it. 
@@ -87,6 +141,29 @@ def optimize(self, options: Optional[FusionOptions] = None): if options is not None and options.enable_gelu_approximation: self.gelu_approximation() - self.remove_unused_constant() + self.postprocess() logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "Attention", + "MultiHeadAttention", + "Gelu", + "FastGelu", + "LayerNormalization", + "SkipLayerNormalization", + "BiasSplitGelu", + "GroupNorm", + "NhwcConv", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + + logger.info(f"Optimized operators:{op_count}") + return op_count diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index fde8392d943cd..9df487726ed8b 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -95,7 +95,6 @@ namespace onnxruntime { namespace test { #define MODEL_FOLDER ORT_TSTR("testdata/transform/") - TEST_F(GraphTransformationTests, IdentityElimination) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "abs-id-max.onnx"; std::shared_ptr model; @@ -4390,11 +4389,12 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { return Status::OK(); }; - const std::vector opsets{11, 12, 13, 14, 15, 15}; + const std::vector opsets{11, 12, 13, 14, 15, 18}; bool shape_test_for_opset15 = false; - for (auto& opset_version : opsets) { + for (auto& opset : opsets) { auto build_test_case = [&](ModelTestBuilder& builder) { + auto opset_version = builder.DomainToVersionMap().find(kOnnxDomain)->second; auto* input_arg0 = builder.MakeInput({{batch_size, seq_lenth, hidden_size}}); auto* input_arg1 = builder.MakeInput({{hidden_size}}); auto* scalar_int_0 = builder.MakeInitializer({}, {0}); @@ -4414,7 +4414,7 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { auto* out = 
builder.MakeOutput(); builder.AddNode("Add", {input_arg0, input_arg1}, {add_out}); - if (opset_version == 15) { + if (opset_version >= 15) { if (shape_test_for_opset15) { auto& shape_1 = builder.AddNode("Shape", {add_out}, {shape_out}); shape_1.AddAttribute("start", (int64_t)1); @@ -4442,11 +4442,11 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { }; std::unique_ptr transformer = std::make_unique(); - if (opset_version == 15 && shape_test_for_opset15) { - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, + if (opset >= 15 && shape_test_for_opset15) { + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, pre_graph_checker)); } else { - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } } @@ -4610,13 +4610,24 @@ TEST_F(GraphTransformationTests, LayerNormWithCastFusionTest_5) { auto* cast_out_2 = builder.MakeIntermediate(); auto* mul_out = builder.MakeIntermediate(); auto* add_out_2 = builder.MakeOutput(); + auto opset = builder.DomainToVersionMap().find(kOnnxDomain)->second; + onnxruntime::NodeArg* axes = nullptr; - builder.AddNode("ReduceMean", {data_arg}, {reduce_mean_out_1}).AddAttribute("axes", std::vector{-1}); + if (opset >= 18) { + axes = builder.MakeInitializer({1}, {-1}); + builder.AddNode("ReduceMean", {data_arg, axes}, {reduce_mean_out_1}); + } else { + builder.AddNode("ReduceMean", {data_arg}, {reduce_mean_out_1}).AddAttribute("axes", std::vector{-1}); + } builder.AddNode("Sub", {data_arg, reduce_mean_out_1}, {sub_out}); builder.AddNode("Cast", {sub_out}, {cast_out_1}) .AddAttribute("to", 
static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); builder.AddNode("Pow", {cast_out_1, pow_initializer}, {pow_out}); - builder.AddNode("ReduceMean", {pow_out}, {reduce_mean_out_2}).AddAttribute("axes", std::vector{-1}); + if (opset >= 18) { + builder.AddNode("ReduceMean", {pow_out, axes}, {reduce_mean_out_2}); + } else { + builder.AddNode("ReduceMean", {pow_out}, {reduce_mean_out_2}).AddAttribute("axes", std::vector{-1}); + } builder.AddNode("Add", {reduce_mean_out_2, add_initializer}, {add_out_1}); builder.AddNode("Sqrt", {add_out_1}, {sqrt_out}); builder.AddNode("Div", {cast_out_1, sqrt_out}, {div_out}); @@ -4652,7 +4663,7 @@ TEST_F(GraphTransformationTests, LayerNormWithCastFusionTest_5) { }; std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, {14, 18}, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 274b9184e037a..80f17fdda3936 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -17,6 +17,31 @@ namespace onnxruntime { namespace test { +void TransformerTester(const std::function& build_test_case, + const std::function& check_transformed_graph, + TransformerLevel baseline_level, + TransformerLevel target_level, + const std::vector& opset_versions, + double per_sample_tolerance, + double relative_per_sample_tolerance, + std::unique_ptr transformer, + const std::function& add_session_options, + const InlinedHashSet& disabled_optimizers) { + ASSERT_TRUE(transformer == nullptr); + for (auto opset_version : opset_versions) { + TransformerTester(build_test_case, + check_transformed_graph, + baseline_level, + 
target_level, + opset_version, + per_sample_tolerance, + relative_per_sample_tolerance, + nullptr, + add_session_options, + disabled_optimizers); + } +} + void TransformerTester(const std::function& build_test_case, const std::function& check_transformed_graph, TransformerLevel baseline_level, @@ -101,22 +126,36 @@ Status TestGraphTransformer(const std::function& const logging::Logger& logger, std::unique_ptr transformer, TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, const std::function& post_graph_checker) { - // Build the model for this test. - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = opset_version; - domain_to_version[kMSDomain] = 1; - Model model("TransformerTester", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, logger); - Graph& graph = model.MainGraph(); - ModelTestBuilder helper(graph); - build_test_case(helper); - helper.SetGraphOutputs(); - ORT_RETURN_IF_ERROR(graph.Resolve()); - ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); + const std::vector opset_versions{opset_version}; + return TestGraphTransformer(build_test_case, opset_versions, logger, std::move(transformer), + level, steps, pre_graph_checker, post_graph_checker); +} + +Status TestGraphTransformer(const std::function& build_test_case, + const std::vector& opset_versions, + const logging::Logger& logger, std::unique_ptr transformer, + TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, + const std::function& post_graph_checker) { onnxruntime::GraphTransformerManager graph_transformation_mgr{steps}; ORT_RETURN_IF_ERROR(graph_transformation_mgr.Register(std::move(transformer), level)); - ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); - ORT_RETURN_IF_ERROR(post_graph_checker(graph)); + + for (auto opset : opset_versions) { + // Build the model for this test. 
+ std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = opset; + domain_to_version[kMSDomain] = 1; + Model model("TransformerTester", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, {}, logger); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + build_test_case(helper); + helper.SetGraphOutputs(); + ORT_RETURN_IF_ERROR(graph.Resolve()); + ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); + ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); + ORT_RETURN_IF_ERROR(post_graph_checker(graph)); + } + return Status::OK(); } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 199f86e056bcb..14c73b2b558af 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -50,6 +50,10 @@ class ModelTestBuilder { ModelTestBuilder(Graph& graph) : graph_(graph) { } + const std::unordered_map& DomainToVersionMap() const noexcept { + return graph_.DomainToVersionMap(); + } + template NodeArg* MakeInput(const std::vector& shape, const std::vector& data) { ONNX_NAMESPACE::TypeProto type_proto; @@ -356,6 +360,17 @@ void TransformerTester(const std::function& buil const std::function& add_session_options = {}, const InlinedHashSet& disabled_optimizers = {}); +void TransformerTester(const std::function& build_test_case, + const std::function& check_transformed_graph, + TransformerLevel baseline_level, + TransformerLevel target_level, + const std::vector& opset_versions, + double per_sample_tolerance = 0.0, + double relative_per_sample_tolerance = 0.0, + std::unique_ptr transformer = nullptr, // must be null in this case. 
+ const std::function& add_session_options = {}, + const InlinedHashSet& disabled_optimizers = {}); + /** * @brief Apply a GraphTransformer to a graph, and run graph checkers before and after applying the transformer. * @@ -372,5 +387,23 @@ Status TestGraphTransformer(const std::function& const logging::Logger& logger, std::unique_ptr transformer, TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, const std::function& post_graph_checker); + +/** + * @brief Apply a GraphTransformer to a graph, and run graph checkers before and after applying the transformer. + * + * @param build_test_case The function to build a graph for testing + * @param opset_versions A graph is created and tested for every opset in this set + * @param logger The logger + * @param transformer The GraphTransformer to be applied + * @param level The transformer level on which the transformer will be applied + * @param steps The step count of the GraphTransformerManager + * @param pre_graph_checker The graph checker function before applying the transformer + * @param post_graph_checker The graph checker function after applying the transformer + */ +Status TestGraphTransformer(const std::function& build_test_case, + const std::vector& opset_versions, + const logging::Logger& logger, std::unique_ptr transformer, + TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, + const std::function& post_graph_checker); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index cbb4de74bfa15..99e94cff6275d 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -278,6 +278,9 @@ TEST(NhwcTransformerTests, ConvSplit) { conv_output_arg, .37f, 131); conv_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); Node& split_node = builder.AddNode("Split", {conv_output_arg}, 
{split_output1_arg, split_output2_arg}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(2)); + } split_node.AddAttribute("axis", static_cast(axis)); builder.AddQLinearBinaryNode("QLinearAdd", split_output1_arg, .37f, 131, @@ -302,6 +305,11 @@ TEST(NhwcTransformerTests, ConvSplit) { check_nhwc_graph, TransformerLevel::Level2, TransformerLevel::Level3); + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3, + 18); } } @@ -323,6 +331,9 @@ TEST(NhwcTransformerTests, ConvSplitQLinearConcat) { conv_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); Node& split_node = builder.AddNode("Split", {conv_output_arg}, {split_output1_arg, split_output2_arg}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(2)); + } split_node.AddAttribute("axis", static_cast(axis)); Node& qlconcat_node = builder.AddQLinearConcatLike( @@ -346,6 +357,11 @@ TEST(NhwcTransformerTests, ConvSplitQLinearConcat) { check_nhwc_graph, TransformerLevel::Level2, TransformerLevel::Level3); + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3, + 18); } } diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index cb19a1e69e8f8..0ba991a4d22e1 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -378,6 +378,9 @@ GetQDQTestCaseFn BuildConsolidationTestCase( auto* split_output_3 = builder.MakeIntermediate(); Node& split_node = builder.AddNode("Split", {upper_dq_output}, {split_output_1, split_output_2, split_output_3}); split_node.AddAttribute("axis", axis); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(3)); + } // add Q auto* lower_q_output_1 = 
builder.MakeIntermediate(); @@ -456,6 +459,9 @@ GetQDQTestCaseFn BuildQDQSplitTestCase( auto* split_output_3 = builder.MakeIntermediate(); Node& split_node = builder.AddNode("Split", {dq_output}, {split_output_1, split_output_2, split_output_3}); split_node.AddAttribute("axis", axis); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(3)); + } // add Q auto* q_split_output_1 = builder.MakeOutput(); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index b253273c5bbc2..e2dcc7fac29ca 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -67,6 +67,14 @@ void QDQTransformerConvTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildQDQConvTestCase(input_shape, weights_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}, {32, 12, 5}); @@ -157,10 +165,13 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { test_case({1, 12, 37}, {32, 12, 5}, 11); test_case({1, 12, 37}, {32, 12, 5}, 12); + test_case({1, 12, 37}, {32, 12, 5}, 18); test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 11); test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 12); + test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 18); test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 11); test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 12); + test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 18); } TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { @@ -292,6 +303,14 @@ void QDQTransformerAveragePoolTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + 
TransformerTester(BuildQDQAveragePoolTestCase(input_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -341,6 +360,14 @@ void QDQTransformerGlobalAveragePoolTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildQDQGlobalAveragePoolTestCase(input_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -391,6 +418,14 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildBinaryOpTestCase(input_shape, op_type), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -522,6 +557,14 @@ void QDQTransformerMatMulTests(bool has_output_q) { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 2, 2}, {1, 2, 4}); @@ -677,6 +720,14 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + 
check_binary_op_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({2, 2}, {2, 4}); @@ -813,6 +864,14 @@ TEST(QDQTransformerTests, DoubleQDQ) { 12, (scale_1 + scale_3) / 2, 0.01); + TransformerTester( + BuildDoubleQDQTestCases(zp_1, zp_2, zp_3, zp_4, scale_1, scale_2, scale_3, scale_4), + succeed ? expect_succeed : expect_fail, + TransformerLevel::Default, + TransformerLevel::Level1, + 18, + (scale_1 + scale_3) / 2, + 0.01); }; auto test_case_2u8_2s8_failed = [&](uint8_t zp_1, uint8_t zp_2, int8_t zp_3, int8_t zp_4, @@ -870,7 +929,8 @@ TEST(QDQTransformerTests, Split) { TransformerTester(BuildQDQSplitTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2); + TransformerLevel::Level2, + {12, 18}); }; test_case({6, 18, 54}, 0); } @@ -887,7 +947,7 @@ TEST(QDQTransformerTests, Split_without_IdenticalChildrenConsolidation) { TransformerTester(BuildConsolidationTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2, 12, {}, {}, nullptr, {}, + TransformerLevel::Level2, {12, 18}, {}, {}, nullptr, {}, {"IdenticalChildrenConsolidation"}); }; test_case({6, 18, 54}, 0); @@ -904,7 +964,8 @@ TEST(QDQTransformerTests, Split_with_IdenticalChildrenConsolidation) { TransformerTester(BuildConsolidationTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2); + TransformerLevel::Level2, + {12, 18}); }; test_case({6, 18, 54}, 0); } @@ -1509,7 +1570,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { check_graph, TransformerLevel::Level1, TransformerLevel::Level2, - 12 /*opset_version*/, + {12, 18} /*opset_version*/, 0.01f /*per_sample_tolerance*/, 0.01f /*relative_per_sample_tolerance*/); }; @@ -1566,6 +1627,14 @@ void QDQTransformerLeakyReluTests() { 0.01 /*per_sample_tolerance*/, 0.01 
/*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -1635,6 +1704,14 @@ void QDQTransformerSigmoidTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -1907,7 +1984,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { TEST(QDQTransformerTests, Clip) { constexpr float epsilon = std::numeric_limits::epsilon(); - auto test_case = [&](float scale, auto zero_point, int clip_count, int opset_version = 12) { + auto test_case = [&](float scale, auto zero_point, int clip_count, int opset_version) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput({1, 32, 112, 112}, std::numeric_limits::min(), @@ -1922,7 +1999,9 @@ TEST(QDQTransformerTests, Clip) { auto* clip_output = builder.MakeIntermediate(); constexpr float min = .0f; constexpr float max = 6.0f; - if (opset_version >= 11) { + auto opset = builder.DomainToVersionMap().find(kOnnxDomain)->second; + EXPECT_EQ(opset_version, opset); + if (opset >= 11) { auto* min_initializer = builder.MakeScalarInitializer(min); auto* max_initializer = builder.MakeScalarInitializer(max); builder.AddNode("Clip", {dq_output, min_initializer, max_initializer}, {clip_output}); @@ -1953,18 +2032,21 @@ TEST(QDQTransformerTests, Clip) { epsilon); }; - test_case(.0235294122248888f, static_cast(-128), 0); // [0, 6] - test_case(.02f, static_cast(-128), 0); // [0, 5.1] - 
test_case(.03f, static_cast(-128), 1); // [0, 7.65] - test_case(.02f, static_cast(127), 1); // [-5.1 , 0] - test_case(.02f, static_cast(0), 1); // [-2.56, 2.54] - test_case(.04f, static_cast(-97), 1); // [-1.24, 8.96] - test_case(.02352941176f, static_cast(0), 0); // [0, 6] - test_case(.02f, static_cast(0), 0); // [0, 5.1] - test_case(.03f, static_cast(0), 1); // [0, 7.65] - test_case(.02f, static_cast(255), 1); // [-5.1, 0] - test_case(.02f, static_cast(128), 1); // [-2.56, 2.54] - test_case(.04f, static_cast(31), 1); // [-1.24, 8.96] + std::vector opsets{12, 18}; + for (auto opset : opsets) { + test_case(.0235294122248888f, static_cast(-128), 0, opset); // [0, 6] + test_case(.02f, static_cast(-128), 0, opset); // [0, 5.1] + test_case(.03f, static_cast(-128), 1, opset); // [0, 7.65] + test_case(.02f, static_cast(127), 1, opset); // [-5.1 , 0] + test_case(.02f, static_cast(0), 1, opset); // [-2.56, 2.54] + test_case(.04f, static_cast(-97), 1, opset); // [-1.24, 8.96] + test_case(.02352941176f, static_cast(0), 0, opset); // [0, 6] + test_case(.02f, static_cast(0), 0, opset); // [0, 5.1] + test_case(.03f, static_cast(0), 1, opset); // [0, 7.65] + test_case(.02f, static_cast(255), 1, opset); // [-5.1, 0] + test_case(.02f, static_cast(128), 1, opset); // [-2.56, 2.54] + test_case(.04f, static_cast(31), 1, opset); // [-1.24, 8.96] + } // opset_version = 10 test_case(.02f, static_cast(-128), 0, 10); // [0, 5.1] @@ -1973,10 +2055,12 @@ TEST(QDQTransformerTests, Clip) { test_case(.03f, static_cast(0), 1, 10); // [0, 7.65] // difference between lower/upper and min/max are within epsilon - test_case(epsilon, static_cast(-127), 0); // [-epsilon, x] (x <= 6 + epsilon) - test_case((6 + epsilon) / 255, static_cast(-128), 0); // [0, 6 + epsilon] - test_case(epsilon, static_cast(1), 0); // [-epsilon, x] (x <= 6 + epsilon) - test_case((6 + epsilon) / 255, static_cast(0), 0); // [0, 6 + epsilon] + for (auto opset : opsets) { + test_case(epsilon, static_cast(-127), 0, opset); // 
[-epsilon, x] (x <= 6 + epsilon) + test_case((6 + epsilon) / 255, static_cast(-128), 0, opset); // [0, 6 + epsilon] + test_case(epsilon, static_cast(1), 0, opset); // [-epsilon, x] (x <= 6 + epsilon) + test_case((6 + epsilon) / 255, static_cast(0), 0, opset); // [0, 6 + epsilon] + } } TEST(QDQTransformerTests, Concat) { @@ -2536,7 +2620,7 @@ TEST(QDQTransformerTests, QDQ_Selector_Test) { // regression test to validate TransposeOptimizer and QDQ Propagation don't loop // see https://github.com/microsoft/onnxruntime/issues/11605 -TEST(QDQTransformerTests, QDQPropagation_GH11605) { +TEST(QDQTransformerTests, QDQPropagation_GH11605_Opset12) { auto test_case = [&]() { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput({1, 4, 4}, @@ -2585,7 +2669,61 @@ TEST(QDQTransformerTests, QDQPropagation_GH11605) { TransformerTester(build_test_case, check_graph, TransformerLevel::Default, - TransformerLevel::Level2); + TransformerLevel::Level2, + 12); + }; + + test_case(); +} + +TEST(QDQTransformerTests, QDQPropagation_GH11605_Opset13) { + auto test_case = [&]() { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 4, 4}, + std::numeric_limits::min(), + std::numeric_limits::max()); + // add DQ + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, 0.123f, uint8_t(0), dq_output); + + // add Transpose 0, 2, 1 + const std::vector& perms{0, 2, 1}; + auto* transpose_output = builder.MakeIntermediate(); + Node& transpose_node = builder.AddNode("Transpose", {dq_output}, {transpose_output}); + transpose_node.AddAttribute("perm", perms); + + // add Softmax with axis=2 (to block the Transpose moving past it due to the transpose perms) + auto* softmax_output = builder.MakeIntermediate(); + Node& softmax_node = builder.AddNode("Softmax", {transpose_output}, {softmax_output}); + softmax_node.AddAttribute("axis", int64_t(2)); + + // add second Transpose. 
this is so the check in TransposeOptimizer::ProcessTranspose for outputs leading to + // a Transpose is satisfied, allowing the first Transpose to move past the Q/DQ inserted by QDQ Propagation + Node& transpose_node2 = builder.AddNode("Transpose", {softmax_output}, {builder.MakeOutput()}); + transpose_node2.AddAttribute("perm", perms); + }; + + // check that an edge case where transpose optimization gets blocked is handled gracefully. + // Original: DQ -> Tr -> SoftM -> Tr + // QDQ Prop inserts a Q/DQ pair to create a QDQ node group for the Transpose: DQ -> Tr -> Q -> DQ -> SoftM -> Tr + // Transpose opt phase 1 moves the Tr down until it blocks on the SoftMax: DQ -> Q -> DQ -> Tr -> SoftM -> Tr + // Transpose opt phase 2 flips the Tr to prior to the DQ as it's not part of a QDQ node group at that point, as + // running the transpose on 8-bit data should be cheaper: DQ -> Q -> Tr -> DQ -> SoftM -> Tr + // QDQ cleanup in Level2 removes the unnecessary DQ/Q pair at the start: Tr -> DQ -> SoftM -> Tr + // this is the optimal result as the Transpose is using 8-bit data and we have no surplus Q/DQ pairs + auto check_graph = [&](InferenceSessionWrapper& session) { + std::vector expected_op_types_in_order{ + "DequantizeLinear", + "Softmax"}; + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph()); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level2, + 13); }; test_case(); diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index 980ac01b9d1f2..1fab4e3502bad 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -94,6 +94,9 @@ TEST(TransposeOptimizerTests, TestSplit) { transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); auto& split_1 = builder.AddNode("Split", 
{transpose_1_out_0}, {split_1_out_0, split_1_out_1}); split_1.AddAttribute("axis", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", {split_1_out_1}, {transpose_3_out_0}); @@ -109,7 +112,7 @@ TEST(TransposeOptimizerTests, TestSplit) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { @@ -123,7 +126,10 @@ TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); - builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); + auto& split_1 = builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", {split_1_out_1}, {transpose_3_out_0}); @@ -139,7 +145,7 @@ TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { @@ -155,6 +161,9 @@ TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); auto& split_1 = builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); split_1.AddAttribute("axis", (int64_t)1); + 
if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", {split_1_out_1}, {transpose_3_out_0}); @@ -170,7 +179,7 @@ TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestConcat) { @@ -201,7 +210,7 @@ TEST(TransposeOptimizerTests, TestConcat) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPad) { @@ -213,10 +222,17 @@ TEST(TransposeOptimizerTests, TestPad) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0}, {pad_1_out_0}); - pad_1.AddAttribute("mode", "constant"); - pad_1.AddAttribute("value", (float)2.3); - pad_1.AddAttribute("pads", std::vector{1, -2, 3, 4, 5, 6, 7, 8}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* value = builder.MakeInitializer({1}, {(float)2.3}); + auto* pads = builder.MakeInitializer({8}, {1, -2, 3, 4, 5, 6, 7, 8}); + auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0, pads, value}, {pad_1_out_0}); + pad_1.AddAttribute("mode", "constant"); + } else { + auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0}, {pad_1_out_0}); + pad_1.AddAttribute("mode", "constant"); + pad_1.AddAttribute("value", (float)2.3); + pad_1.AddAttribute("pads", std::vector{1, -2, 3, 4, 5, 6, 7, 8}); + } auto& transpose_2 = builder.AddNode("Transpose", {pad_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", 
std::vector{0, 2, 3, 1}); }; @@ -230,7 +246,7 @@ TEST(TransposeOptimizerTests, TestPad) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 10); + /*opset_version*/ {10, 18}); } TEST(TransposeOptimizerTests, TestPadOpset15) { @@ -259,7 +275,7 @@ TEST(TransposeOptimizerTests, TestPadOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPadNonconst) { @@ -291,7 +307,7 @@ TEST(TransposeOptimizerTests, TestPadNonconst) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } // The CUDA Resize kernel assumes that the input is NCHW and @@ -312,10 +328,15 @@ TEST(TransposeOptimizerTests, TestResize) { auto* transpose_1_out_0 = builder.MakeIntermediate(); auto* resize_1_out_0 = builder.MakeIntermediate(); auto* transpose_2_out_0 = builder.MakeOutput(); + auto empty_arg = NodeArg("", nullptr); auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 11) { + builder.AddNode("Resize", {transpose_1_out_0, &empty_arg, const_1}, {resize_1_out_0}); + } else { + builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0}); + } auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -329,7 +350,7 @@ TEST(TransposeOptimizerTests, TestResize) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 10); + /*opset_version*/ {10, 18}); } TEST(TransposeOptimizerTests, TestResizeOpset11) { @@ -357,7 +378,7 @@ TEST(TransposeOptimizerTests, TestResizeOpset11) { 
check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } TEST(TransposeOptimizerTests, TestResizeOpset15) { @@ -385,7 +406,7 @@ TEST(TransposeOptimizerTests, TestResizeOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestResizeSizeRoi) { @@ -415,7 +436,7 @@ TEST(TransposeOptimizerTests, TestResizeSizeRoi) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) { @@ -448,7 +469,8 @@ TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) { TransformerTester(build_test_case_1, check_optimized_graph_1, TransformerLevel::Default, - TransformerLevel::Level1); + TransformerLevel::Level1, + {12, 18}); } TEST(TransposeOptimizerTests, TestResizeNonconst) { @@ -477,7 +499,7 @@ TEST(TransposeOptimizerTests, TestResizeNonconst) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) { @@ -506,7 +528,7 @@ TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 13); + /*opset_version*/ {13, 18}); } #endif @@ -534,7 +556,7 @@ TEST(TransposeOptimizerTests, TestAdd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShape) { @@ -557,7 +579,7 @@ TEST(TransposeOptimizerTests, TestShape) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7); + /*opset_version*/ {7, 18}); } TEST(TransposeOptimizerTests, TestShapeOpset15) { @@ -580,7 
+602,7 @@ TEST(TransposeOptimizerTests, TestShapeOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNoStart) { @@ -604,7 +626,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNoStart) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNegativeEnd) { @@ -628,7 +650,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNegativeEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNegativeStartNoEnd) { @@ -652,7 +674,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNegativeStartNoEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceStartAndEnd) { @@ -677,7 +699,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceStartAndEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceEmptyResult) { @@ -702,7 +724,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceEmptyResult) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { @@ -714,9 +736,15 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); - reducesum_1.AddAttribute("axes", std::vector{0, 
-2}); - reducesum_1.AddAttribute("keepdims", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0, init}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("keepdims", (int64_t)1); + } else { + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("axes", std::vector{0, -2}); + reducesum_1.AddAttribute("keepdims", (int64_t)1); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -730,7 +758,7 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -756,7 +784,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -770,9 +798,15 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsFalse) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); - reducesum_1.AddAttribute("axes", std::vector{0, -2}); - reducesum_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0, init}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("keepdims", (int64_t)0); + } else { + 
auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("axes", std::vector{0, -2}); + reducesum_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -786,7 +820,7 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -812,7 +846,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -874,7 +908,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueOpset15) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrue) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrueOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -905,7 +939,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrue) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalse) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalseOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -933,7 +967,7 @@ TEST(TransposeOptimizerTests, 
TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalse) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput) { +TEST(TransposeOptimizerTests, TestReduceSumNoAxesInputOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1017,7 +1051,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseOpset15) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrue) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrueOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -1048,7 +1082,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrue) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalse) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalseOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -1076,7 +1110,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalse) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2) { +TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2Opset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1103,7 +1137,7 @@ TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2) { 
/*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOpt) { +TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOptOpset13) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* input1_arg = MakeInput(builder, {std::vector{}}, std::vector{}, {-1}); @@ -1130,7 +1164,7 @@ TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOpt) { /*opset_version*/ 13); } -TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsFalseNoOpt) { +TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsFalseNoOptOpset13) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* input1_arg = MakeInput(builder, {std::vector{}}, std::vector{}, {-1}); @@ -1166,9 +1200,15 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrue) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)1); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)1); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -1182,7 +1222,7 @@ 
TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrueDefaultAxes) { @@ -1206,7 +1246,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrueDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { @@ -1218,13 +1258,19 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; - + auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) { int transpose_cost = EstimateTransposeCost(session.GetGraph()); EXPECT_EQ(transpose_cost, 0); @@ -1234,7 +1280,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ 
{15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalseDefaultAxes) { @@ -1258,7 +1304,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalseDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMax) { @@ -1270,8 +1316,13 @@ TEST(TransposeOptimizerTests, TestReduceMax) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -1285,7 +1336,7 @@ TEST(TransposeOptimizerTests, TestReduceMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxDefaultAxes) { @@ -1308,7 +1359,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { @@ -1320,9 +1371,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 
3, 1, 2}); - auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0}, {reducelogsum_1_out_0}); - reducelogsum_1.AddAttribute("axes", std::vector{0, -2}); - reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0, axes}, {reducelogsum_1_out_0}); + reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0}, {reducelogsum_1_out_0}); + reducelogsum_1.AddAttribute("axes", std::vector{0, -2}); + reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducelogsum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1336,7 +1393,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { @@ -1348,9 +1405,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducelogsumexp_1 = builder.AddNode("ReduceLogSumExp", {transpose_1_out_0}, {reducelogsumexp_1_out_0}); - reducelogsumexp_1.AddAttribute("axes", std::vector{0, -2}); - reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducelogsumexp_1 = builder.AddNode("ReduceLogSumExp", {transpose_1_out_0, axes}, {reducelogsumexp_1_out_0}); + reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducelogsumexp_1 = 
builder.AddNode("ReduceLogSumExp", {transpose_1_out_0}, {reducelogsumexp_1_out_0}); + reducelogsumexp_1.AddAttribute("axes", std::vector{0, -2}); + reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducelogsumexp_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1364,7 +1427,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { @@ -1376,9 +1439,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1392,7 +1461,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { @@ -1404,9 +1473,15 @@ 
TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0}, {reducemean_1_out_0}); - reducemean_1.AddAttribute("axes", std::vector{0, -2}); - reducemean_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0, axes}, {reducemean_1_out_0}); + reducemean_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0}, {reducemean_1_out_0}); + reducemean_1.AddAttribute("axes", std::vector{0, -2}); + reducemean_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemean_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1420,7 +1495,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { @@ -1432,9 +1507,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0}, {reducemin_1_out_0}); - reducemin_1.AddAttribute("axes", std::vector{0, -2}); - reducemin_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0, axes}, {reducemin_1_out_0}); + 
reducemin_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0}, {reducemin_1_out_0}); + reducemin_1.AddAttribute("axes", std::vector{0, -2}); + reducemin_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemin_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1448,7 +1529,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { @@ -1460,9 +1541,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0}, {reduceprod_1_out_0}); - reduceprod_1.AddAttribute("axes", std::vector{0, -2}); - reduceprod_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0, axes}, {reduceprod_1_out_0}); + reduceprod_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0}, {reduceprod_1_out_0}); + reduceprod_1.AddAttribute("axes", std::vector{0, -2}); + reduceprod_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reduceprod_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1476,7 +1563,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } 
TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { @@ -1488,9 +1575,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0}, {reducesumsquare_1_out_0}); - reducesumsquare_1.AddAttribute("axes", std::vector{0, -2}); - reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0, init}, {reducesumsquare_1_out_0}); + reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0}, {reducesumsquare_1_out_0}); + reducesumsquare_1.AddAttribute("axes", std::vector{0, -2}); + reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesumsquare_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1504,7 +1597,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { @@ -1516,9 +1609,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0}, {reducel1_1_out_0}); - reducel1_1.AddAttribute("axes", std::vector{0, -2}); - reducel1_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* 
axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0, axes}, {reducel1_1_out_0}); + reducel1_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0}, {reducel1_1_out_0}); + reducel1_1.AddAttribute("axes", std::vector{0, -2}); + reducel1_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducel1_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1532,7 +1631,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { @@ -1544,9 +1643,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0}, {reducel2_1_out_0}); - reducel2_1.AddAttribute("axes", std::vector{0, -2}); - reducel2_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0, axes}, {reducel2_1_out_0}); + reducel2_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0}, {reducel2_1_out_0}); + reducel2_1.AddAttribute("axes", std::vector{0, -2}); + reducel2_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducel2_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1560,10 +1665,10 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { check_optimized_graph_1, 
TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } -TEST(TransposeOptimizerTests, TestSqueeze) { +TEST(TransposeOptimizerTests, TestSqueezeOpset7) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{1, -1, 1, 2}}, {1, 4, 1, 2}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1663,7 +1768,7 @@ TEST(TransposeOptimizerTests, TestSqueezeEmptyNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7); + /*opset_version*/ {7, 18}); } TEST(TransposeOptimizerTests, TestSqueezeEmptyNoOptOpset15) { @@ -1708,10 +1813,10 @@ TEST(TransposeOptimizerTests, TestSqueezeNonconstNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } -TEST(TransposeOptimizerTests, TestUnsqueeze) { +TEST(TransposeOptimizerTests, TestUnsqueezeOpset7) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{2, -1, 6, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1901,7 +2006,7 @@ TEST(TransposeOptimizerTests, TestSliceOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNoAxesOpset15) { @@ -1929,7 +2034,7 @@ TEST(TransposeOptimizerTests, TestSliceNoAxesOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNegativeAxesInt32) { @@ -1958,7 +2063,7 @@ TEST(TransposeOptimizerTests, TestSliceNegativeAxesInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceStepsInt32) { @@ 
-1988,7 +2093,7 @@ TEST(TransposeOptimizerTests, TestSliceStepsInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNegativeAxes) { @@ -2017,7 +2122,7 @@ TEST(TransposeOptimizerTests, TestSliceNegativeAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceSteps) { @@ -2047,7 +2152,7 @@ TEST(TransposeOptimizerTests, TestSliceSteps) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNonconstNoOpt) { @@ -2075,7 +2180,7 @@ TEST(TransposeOptimizerTests, TestSliceNonconstNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNonconstInt32NoOpt) { @@ -2103,7 +2208,7 @@ TEST(TransposeOptimizerTests, TestSliceNonconstInt32NoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStarts) { @@ -2131,7 +2236,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStarts) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthNoOpt) { @@ -2158,7 +2263,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthNoO check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsInt32) { @@ -2186,7 +2291,7 @@ 
TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthInt32NoOpt) { @@ -2213,7 +2318,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthInt check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTile) { @@ -2240,7 +2345,7 @@ TEST(TransposeOptimizerTests, TestTile) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTileNonconstReps) { @@ -2267,7 +2372,7 @@ TEST(TransposeOptimizerTests, TestTileNonconstReps) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsTrue) { @@ -2294,7 +2399,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsFalse) { @@ -2321,7 +2426,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxis) { @@ -2347,7 +2452,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinKeepdimsTrue) { @@ -2375,7 +2480,7 @@ TEST(TransposeOptimizerTests, 
TestArgMinKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinKeepdimsFalse) { @@ -2403,7 +2508,7 @@ TEST(TransposeOptimizerTests, TestArgMinKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMin) { @@ -2430,7 +2535,7 @@ TEST(TransposeOptimizerTests, TestArgMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMax) { @@ -2458,7 +2563,7 @@ TEST(TransposeOptimizerTests, TestArgMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSoftmax) { @@ -2771,7 +2876,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsAdd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMul) { @@ -2801,7 +2906,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMul) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsSub) { @@ -2831,7 +2936,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsSub) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsDiv) { @@ -2861,7 +2966,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsDiv) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } 
TEST(TransposeOptimizerTests, TestBroadcastOpsPRelu) { @@ -2891,7 +2996,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsPRelu) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsGreater) { @@ -2921,7 +3026,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsGreater) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsLess) { @@ -2951,7 +3056,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsLess) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsPow) { @@ -2981,7 +3086,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsPow) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMax) { @@ -3011,7 +3116,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMin) { @@ -3041,7 +3146,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMean) { @@ -3071,7 +3176,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMean) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsSum) { @@ -3101,7 +3206,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsSum) { 
check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsGreaterOrEqual) { @@ -3131,7 +3236,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsGreaterOrEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsLessOrEqual) { @@ -3161,7 +3266,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsLessOrEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsEqual) { @@ -3191,7 +3296,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsAnd) { @@ -3221,7 +3326,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsAnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsOr) { @@ -3251,7 +3356,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsOr) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsXor) { @@ -3281,7 +3386,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsXor) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMod) { @@ -3312,7 +3417,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMod) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + 
/*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsBitShift) { @@ -3343,7 +3448,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsBitShift) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestWhere) { @@ -3374,7 +3479,7 @@ TEST(TransposeOptimizerTests, TestWhere) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalar) { @@ -3402,7 +3507,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearScalar) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalarIgnoreAxis) { @@ -3431,7 +3536,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearScalarIgnoreAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearVector) { @@ -3460,7 +3565,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearVector) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearVectorUnknownRank) { @@ -3489,7 +3594,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearVectorUnknownRank) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalarOpset10) { @@ -3546,7 +3651,7 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearScalarIgnoreAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } 
TEST(TransposeOptimizerTests, TestDequantizeLinearVector) { @@ -3575,7 +3680,7 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearVector) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { @@ -3665,7 +3770,7 @@ TEST(TransposeOptimizerTests, TestCast) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastReusedInputs) { @@ -3696,7 +3801,7 @@ TEST(TransposeOptimizerTests, TestBroadcastReusedInputs) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTransposeGraphOutput) { @@ -3724,7 +3829,7 @@ TEST(TransposeOptimizerTests, TestTransposeGraphOutput) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSimpleReshapeAsTranspose) { @@ -3757,7 +3862,7 @@ TEST(TransposeOptimizerTests, TestSimpleReshapeAsTranspose) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReshapeAsTransposeGraphOutput) { @@ -3788,7 +3893,7 @@ TEST(TransposeOptimizerTests, TestReshapeAsTransposeGraphOutput) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestCancelingNodesGraphOutputs) { @@ -3819,7 +3924,7 @@ TEST(TransposeOptimizerTests, TestCancelingNodesGraphOutputs) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestNonCancelingReshape) { 
@@ -3855,7 +3960,7 @@ TEST(TransposeOptimizerTests, TestNonCancelingReshape) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPushBroadcastUnsqueezeTranspose) { @@ -3890,7 +3995,7 @@ TEST(TransposeOptimizerTests, TestPushBroadcastUnsqueezeTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOptimizeTowardsTranspose) { @@ -3920,7 +4025,7 @@ TEST(TransposeOptimizerTests, TestOptimizeTowardsTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOnlyOptimizeTowardsTranspose) { @@ -3947,7 +4052,7 @@ TEST(TransposeOptimizerTests, TestOnlyOptimizeTowardsTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestDontOptimizeWrongInput) { @@ -3973,7 +4078,7 @@ TEST(TransposeOptimizerTests, TestDontOptimizeWrongInput) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOptimizeBothInputs) { @@ -4001,7 +4106,7 @@ TEST(TransposeOptimizerTests, TestOptimizeBothInputs) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { @@ -4012,9 +4117,16 @@ TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, 
{reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{1}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({1}, {1}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, init}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } + else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{1}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } }; auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) { @@ -4027,7 +4139,7 @@ TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } // regression test for a model where the transpose optimizations were not completed in a single pass in level 1. 
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 428910849f8db..7a9f09c1851dc 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -131,7 +131,18 @@ "^test_edge_pad_cuda", "^test_reflect_pad_cuda", "^test_softplus_example_expanded_cuda", - "^test_softplus_expanded_cuda" + "^test_softplus_expanded_cuda", + + // TODO: Recover these real model tests from onnx + "^test_vgg19", + "^test_zfnet512", + "^test_bvlc_alexnet", + "^test_densenet121", + "^test_inception_v1", + "^test_inception_v2", + "^test_resnet50", + "^test_shufflenet", + "^test_squeezenet" ], "current_failing_tests_x86": [ "^test_vgg19", diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py index 3a0c119c22f42..511a0d192018a 100644 --- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py @@ -392,7 +392,7 @@ def _run_onnx_session_with_ortvaluevector( _nvtx_range_push("run_with_ortvaluevector") run_options = onnxruntime.RunOptions() - run_options.synchronize_execution_providers = True + run_options.add_run_config_entry("disable_synchronize_execution_providers", "1") sess.run_with_ortvaluevector(run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices) _nvtx_range_pop() diff --git a/setup.py b/setup.py index 0c10195dc3b62..294b975a56595 100644 --- a/setup.py +++ b/setup.py @@ -481,9 +481,12 @@ def finalize_options(self): "onnxruntime.quantization.operators", "onnxruntime.quantization.CalTableFlatBuffers", "onnxruntime.transformers", + "onnxruntime.transformers.models.bart", + "onnxruntime.transformers.models.bert", "onnxruntime.transformers.models.gpt2", "onnxruntime.transformers.models.longformer", 
"onnxruntime.transformers.models.t5", + "onnxruntime.transformers.models.stable_diffusion", ] package_data = {"onnxruntime.tools.mobile_helpers": ["*.md", "*.config"]} diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index baace703ca28a..35ff58ba6f7ee 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.28 + version: 1.0.29 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.28 + version: 1.0.29 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here.