From 5269857175209680ab8efc40cf514a0fb685a9f2 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 27 Mar 2024 12:08:07 -0400
Subject: [PATCH 01/21] Add support for compile-time generation of streaming
 AVX kernels

---
 .github/workflows/tests_linux.yml             | 15 +++++++++++-
 .../simulators/lightning_qubit/CMakeLists.txt |  5 ++++
 .../cpu_kernels/avx_common/AVX2Concept.hpp    | 23 ++++++++++++++++++-
 .../cpu_kernels/avx_common/AVX512Concept.hpp  | 23 ++++++++++++++++++-
 4 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml
index a6373a8111..d4ce49c648 100644
--- a/.github/workflows/tests_linux.yml
+++ b/.github/workflows/tests_linux.yml
@@ -69,10 +69,22 @@ jobs:
               -DENABLE_COVERAGE=ON \
               -DLQ_ENABLE_KERNEL_OMP=ON
 
+            cmake . -BBuildKernelAVXStream -G Ninja \
+              -DCMAKE_BUILD_TYPE=Debug \
+              -DBUILD_TESTS=ON \
+              -DENABLE_PYTHON=OFF \
+              -DPL_BACKEND=${{ matrix.pl_backend }} \
+              -DCMAKE_CXX_COMPILER=$(which g++-$GCC_VERSION) \
+              -DENABLE_COVERAGE=ON \
+              -DLQ_ENABLE_KERNEL_AVX_STREAMING=ON \
+              -DLQ_ENABLE_KERNEL_OMP=ON
+
+
             cmake --build ./Build
             cmake --build ./BuildKernelOMP
+            cmake --build ./BuildKernelAVXStream
 
-            for d in Build BuildKernelOMP; do
+            for d in Build BuildKernelOMP BuildKernelAVXStream; do
               cd ./$d
               mkdir -p ./tests/results
               for file in *runner ; do ./$file --order lex --reporter junit --out ./tests/results/report_$file.xml; done;
@@ -91,6 +103,7 @@ jobs:
           path: |
             ./Build/tests/results/
             ./BuildKernelOMP/tests/results/
+            ./BuildKernelAVXStream/tests/results/
 
           if-no-files-found: error
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
index 0ce82387c2..499057a10b 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
@@ -21,6 +21,7 @@ add_library(lightning_qubit STATIC ${LQUBIT_FILES})
 option(ENABLE_BLAS "Enable BLAS" OFF)
 option(ENABLE_GATE_DISPATCHER "Enable gate kernel dispatching on AVX/AVX2/AVX512" ON)
 option(LQ_ENABLE_KERNEL_OMP "Enable OpenMP pragmas for gate kernels" OFF)
+option(LQ_ENABLE_KERNEL_AVX_STREAMING "Enable AVX2/512 streaming operations for gate kernels" OFF)
 
 # Inform the compiler that this device is enabled.
 target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLQUBIT=1")
@@ -51,6 +52,10 @@ if(LQ_ENABLE_KERNEL_OMP)
     add_definitions("-DPL_LQ_KERNEL_OMP")
 endif()
 
+if(LQ_ENABLE_KERNEL_AVX_STREAMING)
+    add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING")
+endif()
+
 target_link_libraries(lightning_qubit PUBLIC    lightning_compile_options
                                                 lightning_external_libs
                                                 lightning_base
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
index 6488d78ea7..e003330958 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
@@ -80,7 +80,7 @@ template <typename T> struct AVX2Concept {
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm256_store_ps(reinterpret_cast<PrecisionT *>(p), value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -91,6 +91,27 @@ template <typename T> struct AVX2Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(reinterpret_cast<PrecisionT *>(p), value);
+        } else if constexpr (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(reinterpret_cast<PrecisionT *>(p), value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        #ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+        #else
+        stream_(p, value);
+        #endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
index d234e91b4a..672181051d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
@@ -81,7 +81,7 @@ template <typename T> struct AVX512Concept {
     }
 
     PL_FORCE_INLINE
-    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+    static void store_(std::complex<PrecisionT> *p, IntrinsicType value) {
         if constexpr (std::is_same_v<PrecisionT, float>) {
             _mm512_store_ps(p, value);
         } else if (std::is_same_v<PrecisionT, double>) {
@@ -92,6 +92,27 @@ template <typename T> struct AVX512Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(std::complex<PrecisionT> *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(reinterpret_cast<PrecisionT *>(p), value);
+        } else if constexpr (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(reinterpret_cast<PrecisionT *>(p), value);
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
+    PL_FORCE_INLINE
+    static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
+        #ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store_(p, value);
+        #else
+        stream_(p, value);
+        #endif
+    }
+
     PL_FORCE_INLINE
     static auto mul(IntrinsicType v0, IntrinsicType v1) {
         if constexpr (std::is_same_v<PrecisionT, float>) {

From 6bb9a37e790ba3d492be0b833f474501dbc262cc Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 27 Mar 2024 12:19:55 -0400
Subject: [PATCH 02/21] Add streaming and tuning docs

---
 .../development/avx_kernels/index.rst               |  1 +
 .../development/avx_kernels/kernel_tuning.rst       | 13 +++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

diff --git a/doc/lightning_qubit/development/avx_kernels/index.rst b/doc/lightning_qubit/development/avx_kernels/index.rst
index 0a71829951..32577349da 100644
--- a/doc/lightning_qubit/development/avx_kernels/index.rst
+++ b/doc/lightning_qubit/development/avx_kernels/index.rst
@@ -22,3 +22,4 @@ AVX2/AVX512 kernels
 
    implementation
    build_system
+   kernel_tuning
diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
new file mode 100644
index 0000000000..b0cf7d99e0
--- /dev/null
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -0,0 +1,13 @@
+Kernel performance tuning
+#########################
+
+Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentation method implementation, and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
+
+However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
+
+OpenMP threaded kernels
+-----------------------
+
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient-workloads with many observables this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+
+For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations through use of the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache, and can improve performance for larger workloads.
\ No newline at end of file

From 24f290abf823f7cef0cfeaa836cde5e254d493e4 Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Wed, 27 Mar 2024 16:20:41 +0000
Subject: [PATCH 03/21] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 0ae493ae06..58da0aa862 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev19"
+__version__ = "0.36.0-dev20"

From f87bdc1173638ccd6a5dfd28a5509db4c4dcbbd5 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 27 Mar 2024 12:23:54 -0400
Subject: [PATCH 04/21] Trigger CI


From 68eb3cf5385a7cdde857efff897ad8f4ce67f678 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 27 Mar 2024 12:39:33 -0400
Subject: [PATCH 05/21] Update overloads

---
 .../cpu_kernels/avx_common/AVX2Concept.hpp    | 22 +++++++++++++++---
 .../cpu_kernels/avx_common/AVX512Concept.hpp  | 23 ++++++++++++++++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
index e003330958..34fcbbe67d 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX2Concept.hpp
@@ -103,13 +103,29 @@ template <typename T> struct AVX2Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm256_stream_ps(p, value); // non-temporal store
+        } else if constexpr (std::is_same_v<PrecisionT, double>) {
+            _mm256_stream_pd(p, value); // non-temporal store
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
     PL_FORCE_INLINE
     static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
-        #ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
-        store_(p, value);
-        #else
-        stream_(p, value);
-        #endif
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        stream_(p, value); // opt-in: non-temporal store
+#else
+        store_(reinterpret_cast<std::complex<PrecisionT> *>(p), value);
+#endif
     }
 
     PL_FORCE_INLINE
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
index 672181051d..4fb2e3a449 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/gates/cpu_kernels/avx_common/AVX512Concept.hpp
@@ -104,13 +104,30 @@ template <typename T> struct AVX512Concept {
         }
     }
 
+    PL_FORCE_INLINE
+    static void stream_(PrecisionT *p, IntrinsicType value) {
+        if constexpr (std::is_same_v<PrecisionT, float>) {
+            _mm512_stream_ps(p, value); // non-temporal store
+        } else if constexpr (std::is_same_v<PrecisionT, double>) {
+            _mm512_stream_pd(p, value); // non-temporal store
+        } else {
+            static_assert(std::is_same_v<PrecisionT, float> ||
+                          std::is_same_v<PrecisionT, double>);
+        }
+    }
+
     PL_FORCE_INLINE
     static void store(std::complex<PrecisionT> *p, IntrinsicType value) {
-        #ifdef PL_LQ_KERNEL_AVX_STREAMING
+        store(reinterpret_cast<PrecisionT *>(p), value);
+    }
+
+    PL_FORCE_INLINE
+    static void store(PrecisionT *p, IntrinsicType value) {
-        store_(p, value);
-        #else
-        stream_(p, value);
-        #endif
+#ifdef PL_LQ_KERNEL_AVX_STREAMING
+        stream_(p, value); // opt-in: non-temporal store
+#else
+        store_(reinterpret_cast<std::complex<PrecisionT> *>(p), value);
+#endif
     }
 
     PL_FORCE_INLINE

From 61c4073ef14ac5549f3794da76f486182a5f142f Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:59:50 +0000
Subject: [PATCH 06/21] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 58da0aa862..19067082da 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev20"
+__version__ = "0.36.0-dev21"

From 9fd7c8f46a7d290e337e67f375f27c9c0650fbf5 Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Thu, 4 Apr 2024 18:31:38 +0000
Subject: [PATCH 07/21] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 19067082da..86e85a246a 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev21"
+__version__ = "0.36.0-dev22"

From f2525e4cedf7ab61fa430387d476c585eddf8256 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Thu, 4 Apr 2024 14:33:37 -0400
Subject: [PATCH 08/21] Trigger CI


From 4ad7ca0f0d24d1431eb9a51afa6a4543aa07a107 Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Fri, 5 Apr 2024 08:50:10 -0400
Subject: [PATCH 09/21] Update
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>
---
 .../development/avx_kernels/kernel_tuning.rst               | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
index b0cf7d99e0..636886c59a 100644
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -1,13 +1,13 @@
 Kernel performance tuning
 #########################
 
-Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentation method implementation, and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
+Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
 
 However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
 
 OpenMP threaded kernels
 -----------------------
 
-To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient-workloads with many observables this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
 
-For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations through use of the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache, and can improve performance for larger workloads.
\ No newline at end of file
+For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file

From ae9809ef02cec9237b45544f9085c814c9871a04 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Fri, 19 Apr 2024 16:12:07 -0400
Subject: [PATCH 10/21] Update changelog

---
 .github/CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 8b233348a7..b8f7eb9e2a 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add compile-time support for AVX2/512 streaming operations in `lightning.qubit`.
+  [(#664)](https://github.com/PennyLaneAI/pennylane-lightning/pull/664)
+
 * `lightning.kokkos` supports mid-circuit measurements.
   [(#672)](https://github.com/PennyLaneAI/pennylane-lightning/pull/672)
 
@@ -108,7 +111,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Mudit Pandey, Shuli Shu
+Ali Asadi, Amintor Dusko, Christina Lee, Vincent Michaud-Rioux, Lee James O'Riordan, Mudit Pandey, Shuli Shu
 
 ---
 

From 7377b2eb3e7eb04945959ce4e7020a651c9394c3 Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Fri, 19 Apr 2024 20:14:52 +0000
Subject: [PATCH 11/21] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 9c6e90e856..01c5ebb276 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev33"
+__version__ = "0.36.0-dev34"

From 75e31ca3b38c6b25b01bda9cc355947be0eb8e49 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Fri, 19 Apr 2024 16:18:24 -0400
Subject: [PATCH 12/21] Trigger CI


From d0aaeece37eee4f446e2e458e6d3f012bdebbfd4 Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Wed, 24 Apr 2024 09:39:35 -0400
Subject: [PATCH 13/21] Update
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
---
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
index 636886c59a..81fcf2ef19 100644
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -3,7 +3,7 @@ Kernel performance tuning
 
 Lightning-Qubit's kernel implementations are by default tuned for high throughput single-threaded performance with gradient workloads. To enable this, we add OpenMP threading within the adjoint differentiation method implementation and use SIMD-level intrinsics to ensure fast performance for each given circuit in such a workload.
 
-However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate-kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
+However, sometimes we may want to modify the above defaults to favour a given workload, such as by enabling multi-threaded execution of the gate kernels instead. For this, we have several compile-time flags to change the operating behaviour of Lightning-Qubit kernels.
 
 OpenMP threaded kernels
 -----------------------

From 2b1236ebe07c65e0c6f66e309d7ddfe308a660e5 Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Wed, 24 Apr 2024 09:39:41 -0400
Subject: [PATCH 14/21] Update
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
---
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
index 81fcf2ef19..11f9b1cc40 100644
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -8,6 +8,6 @@ However, sometimes we may want to modify the above defaults to favour a given wo
 OpenMP threaded kernels
 -----------------------
 
-To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=on` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
 
 For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file

From 1b4129cecff2b5551675ac6560dd2e143a598e54 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Wed, 24 Apr 2024 13:39:56 +0000
Subject: [PATCH 15/21] Auto update version from '0.36.0-dev34' to
 '0.36.0-dev37'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 01c5ebb276..3e9cf3b0e4 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev34"
+__version__ = "0.36.0-dev37"

From 22e19820f473b71feb35c94605d4b74e675d37c9 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <loriordan@gmail.com>
Date: Wed, 24 Apr 2024 10:36:22 -0400
Subject: [PATCH 16/21] Updates from code review

---
 .github/workflows/tests_linux.yml                              | 3 ++-
 .../core/src/simulators/lightning_qubit/CMakeLists.txt         | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml
index 68f126eaf7..df9ef214a8 100644
--- a/.github/workflows/tests_linux.yml
+++ b/.github/workflows/tests_linux.yml
@@ -94,7 +94,8 @@ jobs:
               lcov --remove coverage.info '/usr/*' --output-file coverage.info
               cd ..
             done
-            lcov --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info -o coverage.info
+            lcov  --add-tracefile ./Build/coverage.info -a ./BuildKernelOMP/coverage.info \
+                  --add-tracefile ./BuildKernelAVXStream/coverage.info -o coverage.info
             mv coverage.info coverage-${{ github.job }}-${{ matrix.pl_backend }}.info
 
       - name: Upload test results
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
index 499057a10b..f07b94923c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/CMakeLists.txt
@@ -53,6 +53,9 @@ if(LQ_ENABLE_KERNEL_OMP)
 endif()
 
 if(LQ_ENABLE_KERNEL_AVX_STREAMING)
+    if(NOT LQ_ENABLE_KERNEL_OMP)
+        message(WARNING "AVX streaming operations require `LQ_ENABLE_KERNEL_OMP` to be enabled.")
+    endif()
     add_definitions("-DPL_LQ_KERNEL_AVX_STREAMING")
 endif()
 

From 9d437842010afff5b8157b5eee50ceba2558d257 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Wed, 24 Apr 2024 14:59:03 +0000
Subject: [PATCH 17/21] Auto update version from '0.36.0-dev37' to
 '0.36.0-dev38'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 3e9cf3b0e4..47c268d60b 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev37"
+__version__ = "0.36.0-dev38"

From 540052f9938d2dd85106ca9809d026cfc2d6da9f Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Wed, 24 Apr 2024 18:00:28 +0000
Subject: [PATCH 18/21] Auto update version from '0.36.0-dev38' to
 '0.36.0-dev39'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 47c268d60b..faa706866c 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev38"
+__version__ = "0.36.0-dev39"

From 4be3c7f53625fef73b762ad10699c1d3e837fec8 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Thu, 25 Apr 2024 14:37:24 +0000
Subject: [PATCH 19/21] Auto update version from '0.36.0-dev40' to
 '0.36.0-dev41'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 06b4d144a5..fd9437debb 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.36.0-dev40"
+__version__ = "0.36.0-dev41"

From 479c287a7228e7cd94a190ae8e8cad54c0d6e83e Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Thu, 25 Apr 2024 10:46:31 -0400
Subject: [PATCH 20/21] Update
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
index 11f9b1cc40..cef428c3cb 100644
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -10,4 +10,4 @@ OpenMP threaded kernels
 
 To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
 
-For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the `-DLQ_ENABLE_KERNEL_AVX_STREAMING=on` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file
+For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file

From 5b43dbeae63a987952cbe9f252f843098ec2f8a4 Mon Sep 17 00:00:00 2001
From: Lee James O'Riordan <mlxd@users.noreply.github.com>
Date: Thu, 25 Apr 2024 10:46:39 -0400
Subject: [PATCH 21/21] Update
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst

Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
index cef428c3cb..bc65e33f59 100644
--- a/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
+++ b/doc/lightning_qubit/development/avx_kernels/kernel_tuning.rst
@@ -8,6 +8,6 @@ However, sometimes we may want to modify the above defaults to favour a given wo
 OpenMP threaded kernels
 -----------------------
 
-To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the `-DLQ_ENABLE_KERNEL_OMP=ON` CMake flag. Not, that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
+To enable OpenMP acceleration of the gate kernels, Lightning-Qubit can be compiled with the ``-DLQ_ENABLE_KERNEL_OMP=ON`` CMake flag. Note that for gradient workloads with many observables, this may reduce performance in comparison with the default mode, so this behaviour is opt-in only.
 
 For workloads that show benefit from the use of threaded gate kernels, sometimes updating the CPU cache to accommodate recently modified data can become a bottleneck, and saturates the performance gained at high thread counts. This may be alleviated somewhat on systems supporting AVX2 and AVX-512 operations using the ``-DLQ_ENABLE_KERNEL_AVX_STREAMING=on`` CMake flag. This forces the data to avoid updating the CPU cache and can improve performance for larger workloads.
\ No newline at end of file