From 6260d59b374622d07798057c85889b189c7e1099 Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Thu, 25 Apr 2024 15:20:35 -0400
Subject: [PATCH] Add compile-time support for AVX2/512 streaming operations in
 LQ (#664)

* Add support for compile-time generation of streaming AVX kernels

* Add streaming and tuning docs

* Auto update version

* Trigger CI

* Update overloads

* Auto update version

* Auto update version

* Trigger CI

* Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>

* Update changelog

* Auto update version

* Trigger CI

* Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>

* Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>

* Auto update version from '0.36.0-dev34' to '0.36.0-dev37'

* Updates from code review

* Auto update version from '0.36.0-dev37' to '0.36.0-dev38'

* Auto update version from '0.36.0-dev38' to '0.36.0-dev39'

* Auto update version from '0.36.0-dev40' to '0.36.0-dev41'

* Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>

* Update doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>

---------

Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>
Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 .github/CHANGELOG.md                          |  5 ++-
 .github/workflows/tests_linux.yml             | 18 ++++++++-
 .../development/avx_kernels/index.rst         |  1 +
 .../development/avx_kernels/kernel_tuning.rst | 13 ++++++
 pennylane_lightning/core/_version.py          |  2 +-
 .../simulators/lightning_qubit/CMakeLists.txt |  8 ++++
 .../cpu_kernels/avx_common/AVX2Concept.hpp    | 39 +++++++++++++++++-
 .../cpu_kernels/avx_common/AVX512Concept.hpp  | 40 ++++++++++++++++++-
 8 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index e818b7f1eb..2677a7edd8 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add compile-time support for AVX2/512 streaming operations in `lightning.qubit`.
+  [(#664)](https://github.com/PennyLaneAI/pennylane-lightning/pull/664)
+
 * `lightning.kokkos` supports mid-circuit measurements.
   [(#672)](https://github.com/PennyLaneAI/pennylane-lightning/pull/672)
 
@@ -126,7 +129,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Mudit Pandey, Shuli Shu
+Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Lee James O'Riordan, Mudit Pandey, Shuli Shu
 
 ---
 
diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml
index 840575d24a..df9ef214a8 100644
--- a/.github/workflows/tests_linux.yml
+++ b/.github/workflows/tests_linux.yml
@@ -71,10 +71,22 @@ jobs:
               -DENABLE_COVERAGE=ON \
               -DLQ_ENABLE_KERNEL_OMP=ON
 
+            cmake . -BBuildKernelAVXStream -G Ninja \
+              -DCMAKE_BUILD_TYPE=Debug \
+              -DBUILD_TESTS=ON \
+              -DENABLE_PYTHON=OFF \
+              -DPL_BACKEND=${{ matrix.pl_backend }} \
+              -DCMAKE_CXX_COMPILER=$(which g++-$GCC_VERSION) \
+              -DENABLE_COVERAGE=ON \
+              -DLQ_ENABLE_KERNEL_AVX_STREAM=ON \
+              -DLQ_ENABLE_KERNEL_OMP=ON
+
+
             cmake --build ./Build
             cmake --build ./BuildKernelOMP
+            cmake --build ./BuildKernelAVXStream
 
-            for d in Build BuildKernelOMP; do
+            for d in Build BuildKernelOMP BuildKernelAVXStream; do
               cd ./$d
               mkdir -p ./tests/results
               for file in *runner ; do ./$file --order lex --reporter junit --out ./tests/results/report_$file.xml; done;
@@ -82,7 +94,8 @@ jobs:
               lcov --remove coverage.info '/usr/*' --output-file coverage.info
               cd ..
             done
-            lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info -o coverage.info
+            lcov  --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info \
+                  --add-tracefile ./BuildKernelAVXStream/coverage.info -o coverage.info
             mv coverage.info coverage-${{ github.job }}-${{ matrix.pl_backend }}.info
 
       - name: Upload test results
@@ -93,6 +106,7 @@ jobs:
           path: |
             ./Build/tests/results/
             ./BuildKernelOMP/tests/results/
+            ./BuildKernelAVXStream/tests/results/
 
           if-no-files-found: error
 
diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst
index 0a71829951..32577349da 100644
--- a/doc/lightning_qubit/development/avx_kernels/index.rst
+++ b/doc/lightning_qubit/development/avx_kernels/index.rst
@@ -22,3 +22,4 @@ AVX2/AVX512 kernels
 
    implementation
    build_system
+   kernel_tuning
diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
new file mode 100644
index 0000000000..bc65e33f59
--- /dev/null
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -0,0 +1,13 @@
+Kernel performance tuning
+#########################
+
+Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
+
+However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
+
+OpenMP threaded kernels
+-----------------------
+
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+
+For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 06b4d144a5..fd9437debb 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev40"
+__version__ = "0.36.0-dev41"
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
index 0ce82387c2..f07b94923c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
@@ -21,6 +21,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES})
 option(ENABLE_BLAS "Enable BLAS" OFF)
 option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON)
 option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF)
+option(LQ_ENABLE_KERNEL_AVX_STREAMING "Enable AVX2/512 streaming operations for gate kernels" OFF)
 
 # Inform the compiler that this device is enabled.
 target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1")
@@ -51,6 +52,13 @@ if(LQ_ENABLE_KERNEL_OMP)
     add_definitions("-DPL_LQ_KERNEL_OMP")
 endif()
 
+if(LQ_ENABLE_KERNEL_AVX_STREAMING)
+    if(NOT LQ_ENABLE_KERNEL_OMP)
+        message(WARNING "AVX streaming operations require `LQ_ENABLE_KERNEL_OMP` to be enabled.")
+    endif()
+    add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING")
+endif()
+
 target_link_libraries(lightning_qubit PUBLIC    lightning_compile_options
                                                 lightning_external_libs
                                                 lightning_base
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
index 6488d78ea7..34fcbbe67d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
@@ -80,7 +80,7 @@ template <typename T> struct AVX2Concept {
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm256_store_ps(reinterpret_cast<PrecisionT *>(p), value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -91,6 +91,43 @@ template <typename T> struct AVX2Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(reinterpret_cast<PrecisionT *>(p), value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(reinterpret_cast<PrecisionT *>(p), value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+#else
+        stream_(p, value);
+#endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
index d234e91b4a..4fb2e3a449 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
@@ -81,7 +81,7 @@ template <typename T> struct AVX512Concept {
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm512_store_ps(p, value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -92,6 +92,44 @@ template <typename T> struct AVX512Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(p, value);
+        } else if (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(p, value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+#else
+        stream_(p, value);
+#endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {