From 6260d59b374622d07798057c85889b189c7e1099 Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Thu, 25 Apr 2024 15:20:35 -0400 Subject: [PATCH] Add compile-time support for AVX2/512 streaming operations in LQ (#664) * Add support for compile-time generation of streaming AVX kernels * Add streaming and tuning docs * Auto update version * Trigger CI * Update overloads * Auto update version * Auto update version * Trigger CI * Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> * Update changelog * Auto update version * Trigger CI * Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Vincent Michaud-Rioux * Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Vincent Michaud-Rioux * Auto update version from '0.36.0-dev34' to '0.36.0-dev37' * Updates from code review * Auto update version from '0.36.0-dev37' to '0.36.0-dev38' * Auto update version from '0.36.0-dev38' to '0.36.0-dev39' * Auto update version from '0.36.0-dev40' to '0.36.0-dev41' * Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com> * Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com> --------- Co-authored-by: Dev version update bot Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> Co-authored-by: Vincent Michaud-Rioux Co-authored-by: ringo-but-quantum Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com> --- .github/CHANGELOG.md | 5 ++- .github/workflows/tests_linux.yml | 18 ++++++++- .../development/avx_kernels/index.rst | 1 + .../development/avx_kernels/kernel_tuning.rst | 13 ++++++ pennylane_lightning/core/_version.py | 2 +- .../simulators/lightning_qubit/CMakeLists.txt | 8 ++++ .../cpu_kernels/avx_common/AVX2Concept.hpp | 39 +++++++++++++++++- .../cpu_kernels/avx_common/AVX512Concept.hpp | 40 ++++++++++++++++++- 8 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index e818b7f1eb..2677a7edd8 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,9 @@ ### New features since last release +* Add compile-time support for AVX2/512 streaming operations in `lightning.qubit`. + [(#664)](https://github.com/PennyLaneAI/pennylane-lightning/pull/664) + * `lightning.kokkos` supports mid-circuit measurements. [(#672)](https://github.com/PennyLaneAI/pennylane-lightning/pull/672) @@ -126,7 +129,7 @@ This release contains contributions from (in alphabetical order): -Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Mudit Pandey, Shuli Shu +Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Lee James O'Riordan, Mudit Pandey, Shuli Shu --- diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml index 840575d24a..df9ef214a8 100644 --- a/.github/workflows/tests_linux.yml +++ b/.github/workflows/tests_linux.yml @@ -71,10 +71,22 @@ jobs: -DENABLE_COVERAGE=ON \ -DLQ_ENABLE_KERNEL_OMP=ON + cmake . -BBuildKernelAVXStream -G Ninja \ + -DCMAKE_BUILD_TYPE=Debug \ + -DBUILD_TESTS=ON \ + -DENABLE_PYTHON=OFF \ + -DPL_BACKEND=${{ matrix.pl_backend }} \ + -DCMAKE_CXX_COMPILER=$(which g++-$GCC_VERSION) \ + -DENABLE_COVERAGE=ON \ + -DLQ_ENABLE_KERNEL_AVX_STREAM=ON \ + -DLQ_ENABLE_KERNEL_OMP=ON + + cmake --build ./Build cmake --build ./BuildKernelOMP + cmake --build ./BuildKernelAVXStream - for d in Build BuildKernelOMP; do + for d in Build BuildKernelOMP BuildKernelAVXStream; do cd ./$d mkdir -p ./tests/results for file in *runner ; do ./$file --order lex --reporter junit --out ./tests/results/report_$file.xml; done; @@ -82,7 +94,8 @@ jobs: lcov --remove coverage.info '/usr/*' --output-file coverage.info cd .. done - lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info -o coverage.info + lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info \ + --add-tracefile ./BuildKernelAVXStream/coverage.info -o coverage.info mv coverage.info coverage-${{ github.job }}-${{ matrix.pl_backend }}.info - name: Upload test results @@ -93,6 +106,7 @@ jobs: path: | ./Build/tests/results/ ./BuildKernelOMP/tests/results/ + ./BuildKernelAVXStream/tests/results/ if-no-files-found: error diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst index 0a71829951..32577349da 100644 --- a/doc/lightning_qubit/development/avx_kernels/index.rst +++ b/doc/lightning_qubit/development/avx_kernels/index.rst @@ -22,3 +22,4 @@ AVX2/AVX512 kernels implementation build_system + kernel_tuning diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst new file mode 100644 index 0000000000..bc65e33f59 --- /dev/null +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -0,0 +1,13 @@ +Kernel performance tuning +######################### + +Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. + +However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. + +OpenMP threaded kernels +----------------------- + +To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. + +For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. \ No newline at end of file diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 06b4d144a5..fd9437debb 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev40" +__version__ = "0.36.0-dev41" diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt index 0ce82387c2..f07b94923c 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt @@ -21,6 +21,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES}) option(ENABLE_BLAS "Enable BLAS" OFF) option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON) option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF) +option(LQ_ENABLE_KERNEL_AVX_STREAMING "Enable AVX2/512 streaming operations for gate kernels" OFF) # Inform the compiler that this device is enabled. target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1") @@ -51,6 +52,13 @@ if(LQ_ENABLE_KERNEL_OMP) add_definitions("-DPL_LQ_KERNEL_OMP") endif() +if(LQ_ENABLE_KERNEL_AVX_STREAMING) + if(NOT LQ_ENABLE_KERNEL_OMP) + message(WARNING "AVX streaming operations require `LQ_ENABLE_KERNEL_OMP` to be enabled.") + endif() + add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING") +endif() + target_link_libraries(lightning_qubit PUBLIC lightning_compile_options lightning_external_libs lightning_base diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp index 6488d78ea7..34fcbbe67d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp @@ -80,7 +80,7 @@ template struct AVX2Concept { } PL_FORCE_INLINE - static void store(std::complex *p, IntrinsicType value) { + static void store_(std::complex *p, IntrinsicType value) { if constexpr (std::is_same_v) { _mm256_store_ps(reinterpret_cast(p), value); } else if (std::is_same_v) { @@ -91,6 +91,43 @@ template struct AVX2Concept { } } + PL_FORCE_INLINE + static void stream_(std::complex *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm256_stream_ps(reinterpret_cast(p), value); + } else if (std::is_same_v) { + _mm256_stream_pd(reinterpret_cast(p), value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void stream_(PrecisionT *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm256_stream_ps(p, value); + } else if (std::is_same_v) { + _mm256_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void store(std::complex *p, IntrinsicType value) { + store(reinterpret_cast(p), value); + } + PL_FORCE_INLINE + static void store(PrecisionT *p, IntrinsicType value) { +#ifdef PL_LQ_KERNEL_AVX_STREAMING + store_(p, value); +#else + stream_(p, value); +#endif + } + PL_FORCE_INLINE static auto mul(IntrinsicType v0, IntrinsicType v1) { if constexpr (std::is_same_v) { diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp index d234e91b4a..4fb2e3a449 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp @@ -81,7 +81,7 @@ template struct AVX512Concept { } PL_FORCE_INLINE - static void store(std::complex *p, IntrinsicType value) { + static void store_(std::complex *p, IntrinsicType value) { if constexpr (std::is_same_v) { _mm512_store_ps(p, value); } else if (std::is_same_v) { @@ -92,6 +92,44 @@ template struct AVX512Concept { } } + PL_FORCE_INLINE + static void stream_(std::complex *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm512_stream_ps(p, value); + } else if (std::is_same_v) { + _mm512_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void stream_(PrecisionT *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm512_stream_ps(p, value); + } else if (std::is_same_v) { + _mm512_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void store(std::complex *p, IntrinsicType value) { + store(reinterpret_cast(p), value); + } + + PL_FORCE_INLINE + static void store(PrecisionT *p, IntrinsicType value) { +#ifdef PL_LQ_KERNEL_AVX_STREAMING + store_(p, value); +#else + stream_(p, value); +#endif + } + PL_FORCE_INLINE static auto mul(IntrinsicType v0, IntrinsicType v1) { if constexpr (std::is_same_v) {