From 5269857175209680ab8efc40cf514a0fb685a9f2 Mon Sep 17 00:00:00 2001 From: "Lee J. O'Riordan" Date: Wed, 27 Mar 2024 12:08:07 -0400 Subject: [PATCH 01/21] Add support for compile-time generation of streaming AVX kernels --- .github/workflows/tests_linux.yml | 15 +++++++++++- .../simulators/lightning_qubit/CMakeLists.txt | 5 ++++ .../cpu_kernels/avx_common/AVX2Concept.hpp | 23 ++++++++++++++++++- .../cpu_kernels/avx_common/AVX512Concept.hpp | 23 ++++++++++++++++++- 4 files changed, 63 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml index a6373a8111..d4ce49c648 100644 --- a/.github/workflows/tests_linux.yml +++ b/.github/workflows/tests_linux.yml @@ -69,10 +69,22 @@ jobs: -DENABLE_COVERAGE=ON \ -DLQ_ENABLE_KERNEL_OMP=ON + cmake . -BBuildKernelAVXStream -G Ninja \ + -DCMAKE_BUILD_TYPE=Debug \ + -DBUILD_TESTS=ON \ + -DENABLE_PYTHON=OFF \ + -DPL_BACKEND=${{ matrix.pl_backend }} \ + -DCMAKE_CXX_COMPILER=$(which g++-$GCC_VERSION) \ + -DENABLE_COVERAGE=ON \ + -DLQ_ENABLE_KERNEL_AVX_STREAMING=ON \ + -DLQ_ENABLE_KERNEL_OMP=ON + + cmake --build ./Build cmake --build ./BuildKernelOMP + cmake --build ./BuildKernelAVXStream - for d in Build BuildKernelOMP; do + for d in Build BuildKernelOMP BuildKernelAVXStream; do cd ./$d mkdir -p ./tests/results for file in *runner ; do ./$file --order lex --reporter junit --out ./tests/results/report_$file.xml; done; @@ -91,6 +103,7 @@ jobs: path: | ./Build/tests/results/ ./BuildKernelOMP/tests/results/ + ./BuildKernelAVXStream/tests/results/ if-no-files-found: error diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt index 0ce82387c2..499057a10b 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt @@ -21,6 +21,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES}) 
option(ENABLE_BLAS "Enable BLAS" OFF) option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON) option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF) +option(LQ_ENABLE_KERNEL_AVX_STREAMING "Enable AVX2/512 streaming operations for gate kernels" OFF) # Inform the compiler that this device is enabled. target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1") @@ -51,6 +52,10 @@ if(LQ_ENABLE_KERNEL_OMP) add_definitions("-DPL_LQ_KERNEL_OMP") endif() +if(LQ_ENABLE_KERNEL_AVX_STREAMING) + add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING") +endif() + target_link_libraries(lightning_qubit PUBLIC lightning_compile_options lightning_external_libs lightning_base diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp index 6488d78ea7..e003330958 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp @@ -80,7 +80,7 @@ template struct AVX2Concept { } PL_FORCE_INLINE - static void store(std::complex *p, IntrinsicType value) { + static void store_(std::complex *p, IntrinsicType value) { if constexpr (std::is_same_v) { _mm256_store_ps(reinterpret_cast(p), value); } else if (std::is_same_v) { @@ -91,6 +91,27 @@ template struct AVX2Concept { } } + PL_FORCE_INLINE + static void stream_(std::complex *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm256_stream_ps(reinterpret_cast(p), value); + } else if (std::is_same_v) { + _mm256_stream_pd(reinterpret_cast(p), value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void store(std::complex *p, IntrinsicType value) { + #ifdef PL_LQ_KERNEL_AVX_STREAMING + store_(p, value); + #else + 
stream_(p, value); + #endif + } + PL_FORCE_INLINE static auto mul(IntrinsicType v0, IntrinsicType v1) { if constexpr (std::is_same_v) { diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp index d234e91b4a..672181051d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp @@ -81,7 +81,7 @@ template struct AVX512Concept { } PL_FORCE_INLINE - static void store(std::complex *p, IntrinsicType value) { + static void store_(std::complex *p, IntrinsicType value) { if constexpr (std::is_same_v) { _mm512_store_ps(p, value); } else if (std::is_same_v) { @@ -92,6 +92,27 @@ template struct AVX512Concept { } } + PL_FORCE_INLINE + static void stream_(std::complex *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm512_stream_ps(p, value); + } else if (std::is_same_v) { + _mm512_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + + PL_FORCE_INLINE + static void store(std::complex *p, IntrinsicType value) { + #ifdef PL_LQ_KERNEL_AVX_STREAMING + store_(p, value); + #else + stream_(p, value); + #endif + } + PL_FORCE_INLINE static auto mul(IntrinsicType v0, IntrinsicType v1) { if constexpr (std::is_same_v) { From 6bb9a37e790ba3d492be0b833f474501dbc262cc Mon Sep 17 00:00:00 2001 From: "Lee J. 
O'Riordan" Date: Wed, 27 Mar 2024 12:19:55 -0400 Subject: [PATCH 02/21] Add streaming and tuning docs --- .../development/avx_kernels/index.rst | 1 + .../development/avx_kernels/kernel_tuning.rst | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst index 0a71829951..32577349da 100644 --- a/doc/lightning_qubit/development/avx_kernels/index.rst +++ b/doc/lightning_qubit/development/avx_kernels/index.rst @@ -22,3 +22,4 @@ AVX2/AVX512 kernels implementation build_system + kernel_tuning diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst new file mode 100644 index 0000000000..b0cf7d99e0 --- /dev/null +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -0,0 +1,13 @@ +Kernel performance tuning +######################### + +Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentation method implementation, and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. + +However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. + +OpenMP threaded kernels +----------------------- + +To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient-workloads with many observables this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. 
+ +For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations through use of the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache, and can improve performance for larger workloads. \ No newline at end of file From 24f290abf823f7cef0cfeaa836cde5e254d493e4 Mon Sep 17 00:00:00 2001 From: Dev version update bot Date: Wed, 27 Mar 2024 16:20:41 +0000 Subject: [PATCH 03/21] Auto update version --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 0ae493ae06..58da0aa862 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev19" +__version__ = "0.36.0-dev20" From f87bdc1173638ccd6a5dfd28a5509db4c4dcbbd5 Mon Sep 17 00:00:00 2001 From: "Lee J. O'Riordan" Date: Wed, 27 Mar 2024 12:23:54 -0400 Subject: [PATCH 04/21] Trigger CI From 68eb3cf5385a7cdde857efff897ad8f4ce67f678 Mon Sep 17 00:00:00 2001 From: "Lee J. 
O'Riordan" Date: Wed, 27 Mar 2024 12:39:33 -0400 Subject: [PATCH 05/21] Update overloads --- .../cpu_kernels/avx_common/AVX2Concept.hpp | 22 +++++++++++++++--- .../cpu_kernels/avx_common/AVX512Concept.hpp | 23 ++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp index e003330958..34fcbbe67d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp @@ -103,13 +103,29 @@ template struct AVX2Concept { } } + PL_FORCE_INLINE + static void stream_(PrecisionT *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm256_stream_ps(p, value); + } else if (std::is_same_v) { + _mm256_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + PL_FORCE_INLINE static void store(std::complex *p, IntrinsicType value) { - #ifdef PL_LQ_KERNEL_AVX_STREAMING + store(reinterpret_cast(p), value); + } + PL_FORCE_INLINE + static void store(PrecisionT *p, IntrinsicType value) { +#ifdef PL_LQ_KERNEL_AVX_STREAMING store_(p, value); - #else +#else stream_(p, value); - #endif +#endif } PL_FORCE_INLINE diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp index 672181051d..4fb2e3a449 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp @@ -104,13 +104,30 @@ template struct AVX512Concept { } } + PL_FORCE_INLINE + static void 
stream_(PrecisionT *p, IntrinsicType value) { + if constexpr (std::is_same_v) { + _mm512_stream_ps(p, value); + } else if (std::is_same_v) { + _mm512_stream_pd(p, value); + } else { + static_assert(std::is_same_v || + std::is_same_v); + } + } + PL_FORCE_INLINE static void store(std::complex *p, IntrinsicType value) { - #ifdef PL_LQ_KERNEL_AVX_STREAMING + store(reinterpret_cast(p), value); + } + + PL_FORCE_INLINE + static void store(PrecisionT *p, IntrinsicType value) { +#ifdef PL_LQ_KERNEL_AVX_STREAMING store_(p, value); - #else +#else stream_(p, value); - #endif +#endif } PL_FORCE_INLINE From 61c4073ef14ac5549f3794da76f486182a5f142f Mon Sep 17 00:00:00 2001 From: Dev version update bot Date: Wed, 3 Apr 2024 16:59:50 +0000 Subject: [PATCH 06/21] Auto update version --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 58da0aa862..19067082da 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev20" +__version__ = "0.36.0-dev21" From 9fd7c8f46a7d290e337e67f375f27c9c0650fbf5 Mon Sep 17 00:00:00 2001 From: Dev version update bot Date: Thu, 4 Apr 2024 18:31:38 +0000 Subject: [PATCH 07/21] Auto update version --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 19067082da..86e85a246a 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev21" +__version__ = "0.36.0-dev22" From f2525e4cedf7ab61fa430387d476c585eddf8256 Mon Sep 17 00:00:00 2001 From: "Lee J. 
O'Riordan" Date: Thu, 4 Apr 2024 14:33:37 -0400 Subject: [PATCH 08/21] Trigger CI From 4ad7ca0f0d24d1431eb9a51afa6a4543aa07a107 Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Fri, 5 Apr 2024 08:50:10 -0400 Subject: [PATCH 09/21] Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> --- .../development/avx_kernels/kernel_tuning.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst index b0cf7d99e0..636886c59a 100644 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -1,13 +1,13 @@ Kernel performance tuning ######################### -Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentation method implementation, and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. +Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. OpenMP threaded kernels ----------------------- -To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. 
Not, that for gradient-workloads with many observables this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. +To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. -For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations through use of the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache, and can improve performance for larger workloads. \ No newline at end of file +For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. \ No newline at end of file From ae9809ef02cec9237b45544f9085c814c9871a04 Mon Sep 17 00:00:00 2001 From: "Lee J. O'Riordan" Date: Fri, 19 Apr 2024 16:12:07 -0400 Subject: [PATCH 10/21] Update changelog --- .github/CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 8b233348a7..b8f7eb9e2a 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,9 @@ ### New features since last release +* Add compile-time support for AVX2/512 streaming operations in `lightning.qubit`. 
+ [(#664)](https://github.com/PennyLaneAI/pennylane-lightning/pull/664) + * `lightning.kokkos` supports mid-circuit measurements. [(#672)](https://github.com/PennyLaneAI/pennylane-lightning/pull/672) @@ -108,7 +111,7 @@ This release contains contributions from (in alphabetical order): -Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Mudit Pandey, Shuli Shu +Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Lee James O'Riordan, Mudit Pandey, Shuli Shu --- From 7377b2eb3e7eb04945959ce4e7020a651c9394c3 Mon Sep 17 00:00:00 2001 From: Dev version update bot Date: Fri, 19 Apr 2024 20:14:52 +0000 Subject: [PATCH 11/21] Auto update version --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 9c6e90e856..01c5ebb276 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev33" +__version__ = "0.36.0-dev34" From 75e31ca3b38c6b25b01bda9cc355947be0eb8e49 Mon Sep 17 00:00:00 2001 From: "Lee J. 
O'Riordan" Date: Fri, 19 Apr 2024 16:18:24 -0400 Subject: [PATCH 12/21] Trigger CI From d0aaeece37eee4f446e2e458e6d3f012bdebbfd4 Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Wed, 24 Apr 2024 09:39:35 -0400 Subject: [PATCH 13/21] Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Vincent Michaud-Rioux --- doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst index 636886c59a..81fcf2ef19 100644 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -3,7 +3,7 @@ Kernel performance tuning Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload. -However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. +However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels. 
OpenMP threaded kernels ----------------------- From 2b1236ebe07c65e0c6f66e309d7ddfe308a660e5 Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Wed, 24 Apr 2024 09:39:41 -0400 Subject: [PATCH 14/21] Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Vincent Michaud-Rioux --- doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst index 81fcf2ef19..11f9b1cc40 100644 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -8,6 +8,6 @@ However, sometimes we may want to modify the above defaults to favour a given wo OpenMP threaded kernels ----------------------- -To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. +To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. 
\ No newline at end of file From 1b4129cecff2b5551675ac6560dd2e143a598e54 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Wed, 24 Apr 2024 13:39:56 +0000 Subject: [PATCH 15/21] Auto update version from '0.36.0-dev34' to '0.36.0-dev37' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 01c5ebb276..3e9cf3b0e4 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev34" +__version__ = "0.36.0-dev37" From 22e19820f473b71feb35c94605d4b74e675d37c9 Mon Sep 17 00:00:00 2001 From: "Lee J. O'Riordan" Date: Wed, 24 Apr 2024 10:36:22 -0400 Subject: [PATCH 16/21] Updates from code review --- .github/workflows/tests_linux.yml | 3 ++- .../core/src/simulators/lightning_qubit/CMakeLists.txt | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml index 68f126eaf7..df9ef214a8 100644 --- a/.github/workflows/tests_linux.yml +++ b/.github/workflows/tests_linux.yml @@ -94,7 +94,8 @@ jobs: lcov --remove coverage.info '/usr/*' --output-file coverage.info cd .. 
done - lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info -o coverage.info + lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info \ + --add-tracefile ./BuildKernelAVXStream/coverage.info -o coverage.info mv coverage.info coverage-${{ github.job }}-${{ matrix.pl_backend }}.info - name: Upload test results diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt index 499057a10b..f07b94923c 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt @@ -53,6 +53,9 @@ if(LQ_ENABLE_KERNEL_OMP) endif() if(LQ_ENABLE_KERNEL_AVX_STREAMING) + if(NOT LQ_ENABLE_KERNEL_OMP) + message(WARNING "AVX streaming operations require `LQ_ENABLE_KERNEL_OMP` to be enabled.") + endif() add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING") endif() From 9d437842010afff5b8157b5eee50ceba2558d257 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Wed, 24 Apr 2024 14:59:03 +0000 Subject: [PATCH 17/21] Auto update version from '0.36.0-dev37' to '0.36.0-dev38' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 3e9cf3b0e4..47c268d60b 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev37" +__version__ = "0.36.0-dev38" From 540052f9938d2dd85106ca9809d026cfc2d6da9f Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Wed, 24 Apr 2024 18:00:28 +0000 Subject: [PATCH 18/21] Auto update version from '0.36.0-dev38' to '0.36.0-dev39' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py 
b/pennylane_lightning/core/_version.py index 47c268d60b..faa706866c 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev38" +__version__ = "0.36.0-dev39" From 4be3c7f53625fef73b762ad10699c1d3e837fec8 Mon Sep 17 00:00:00 2001 From: ringo-but-quantum Date: Thu, 25 Apr 2024 14:37:24 +0000 Subject: [PATCH 19/21] Auto update version from '0.36.0-dev40' to '0.36.0-dev41' --- pennylane_lightning/core/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 06b4d144a5..fd9437debb 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.36.0-dev40" +__version__ = "0.36.0-dev41" From 479c287a7228e7cd94a190ae8e8cad54c0d6e83e Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Thu, 25 Apr 2024 10:46:31 -0400 Subject: [PATCH 20/21] Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com> --- doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst index 11f9b1cc40..cef428c3cb 100644 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -10,4 +10,4 @@ OpenMP threaded kernels To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. 
-For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. \ No newline at end of file +For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. \ No newline at end of file From 5b43dbeae63a987952cbe9f252f843098ec2f8a4 Mon Sep 17 00:00:00 2001 From: Lee James O'Riordan Date: Thu, 25 Apr 2024 10:46:39 -0400 Subject: [PATCH 21/21] Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com> --- doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst index cef428c3cb..bc65e33f59 100644 --- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst +++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst @@ -8,6 +8,6 @@ However, sometimes we may want to modify the above defaults to favour a given wo OpenMP threaded kernels ----------------------- -To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` 
CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. +To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Note that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only. For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads. \ No newline at end of file