From fa988d0d9f867b3cfdef6feed6057bd196731bee Mon Sep 17 00:00:00 2001
From: Shuli <08cnbj@gmail.com>
Date: Wed, 18 Oct 2023 21:29:58 +0000
Subject: [PATCH 01/30] init commit

---
 README.rst                         |  44 ++++++++--
 doc/code/__init__.rst              |   6 ++
 doc/index.rst                      |   6 ++
 doc/installation.rst               |   6 ++
 doc/lightning_gpu/device.rst       | 134 +++++++++++++++++++++++++++++
 doc/lightning_gpu/installation.rst |   3 +
 doc/lightning_gpu/package.rst      |  18 ++++
 requirements-dev.txt               |   3 +-
 requirements.txt                   |   1 +
 9 files changed, 212 insertions(+), 9 deletions(-)
 create mode 100644 doc/lightning_gpu/device.rst
 create mode 100644 doc/lightning_gpu/installation.rst
 create mode 100644 doc/lightning_gpu/package.rst
diff --git a/README.rst b/README.rst
index 3e128524fd..a05336b55c 100644
--- a/README.rst
+++ b/README.rst
@@ -283,21 +283,49 @@ For a system with access to the ROCm stack outside of a manylinux container, an
 
 .. installation_LKokkos-end-inclusion-marker-do-not-remove
 
-Please refer to the `plugin documentation <https://docs.pennylane.ai/projects/lightning/>`_ as
-well as to the `PennyLane documentation <https://docs.pennylane.ai/>`_ for further reference.
+.. installation_LGPU-start-inclusion-marker-do-not-remove
+Lightning GPU installation
+=============================
+
+Note that Lightning Qubit should be installed before Lightning-GPU.
+
+Lightning-GPU can be installed using ``pip``:
 
+.. code-block:: console
+    pip install pennylane-lightning[gpu]
+Use of Lightning-GPU also requires explicit installation of the NVIDIA cuQuantum SDK. The SDK library directory may be provided on the ``LD_LIBRARY_PATH`` environment variable, or the SDK Python package may be installed within the Python environment ``site-packages`` directory using ``pip`` or ``conda``. Please see the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ install guide for more information.
 
-GPU support
------------
+Building from source
+--------------------
 
-For GPU support, `PennyLane-Lightning-GPU <https://github.com/PennyLaneAI/pennylane-lightning-gpu>`_
-can be installed by providing the optional ``[gpu]`` tag:
+To build a wheel from the package sources using the direct SDK path:
+
+.. code-block:: console
+    PL_BACKEND="lightning_gpu" python -m pip install -e .
+To build using the PyPI/Conda installed cuQuantum package:
 
 .. code-block:: console
+    cmake -BBuild -DPL_BACKEND=lightning_gpu -DCUQUANTUM_SDK=<path to sdk>
+    cmake --build ./Build --verbose
+    python -m pip install wheel
+    PL_BACKEND="lightning_gpu" python setup.py build_ext
+    PL_BACKEND="lightning_gpu" python setup.py bdist_wheel
+The built wheel can now be installed as:
 
-    $ pip install pennylane-lightning[gpu]
+.. code-block:: console
+    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
+To simplify the build, we recommend using the following containerized build process, which creates `manylinux2014 <https://github.com/pypa/manylinux>`_ compatible wheels.
+
+To build the plugin directly with CMake:
+
+.. code-block:: console
+   cmake -B build -DCUQUANTUM_SDK=<path to sdk> -DBUILD_TESTS=ON -DPL_BACKEND=lightning_gpu -G Ninja
+   cmake --build build
+.. installation_LGPU-end-inclusion-marker-do-not-remove
+
+Please refer to the `plugin documentation <https://docs.pennylane.ai/projects/lightning/>`_ as
+well as to the `PennyLane documentation <https://docs.pennylane.ai/>`_ for further reference.
 
-For more information, please refer to the PennyLane Lightning GPU `documentation <https://docs.pennylane.ai/projects/lightning-gpu>`_.
 
 Docker Support
 --------------
diff --git a/doc/code/__init__.rst b/doc/code/__init__.rst
index bf68bf024a..61d4a44caf 100644
--- a/doc/code/__init__.rst
+++ b/doc/code/__init__.rst
@@ -24,6 +24,11 @@ This section contains the API documentation for the Lightning packages.
    :description: API documentation for the lightning_kokkos package
    :link: ../lightning_kokkos/package.html
 
+.. title-card::
+   :name: lightning_gpu
+   :description: API documentation for the lightning_gpu package
+   :link: ../lightning_gpu/package.html
+
 .. raw:: html
 
         <div style='clear:both'></div>
@@ -34,3 +39,4 @@ This section contains the API documentation for the Lightning packages.
 
    ../lightning_qubit/package
    ../lightning_kokkos/package
+   ../lightning_gpu/package
diff --git a/doc/index.rst b/doc/index.rst
index c9316bd782..25239c62c1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -28,6 +28,11 @@ The Lightning ecosystem provides the following devices:
     :description: A heterogeneous backend state-vector simulator with Kokkos library support.
     :link: lightning_kokkos/device.html
 
+.. title-card::
+    :name: 'lightning.gpu'
+    :description: A heterogeneous backend state-vector simulator with NVIDIA cuQuantum library support.
+    :link: lightning_gpu/device.html
+
 .. raw:: html
 
     <div style='clear:both'></div>
@@ -48,6 +53,7 @@ The Lightning ecosystem provides the following devices:
 
    lightning_qubit/device
    lightning_kokkos/device
+   lightning_gpu/device
 
 .. toctree::
    :maxdepth: 2
diff --git a/doc/installation.rst b/doc/installation.rst
index d89f62a24c..16300d2a13 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -14,6 +14,11 @@ Each device in the Lightning ecosystem is a separate Python package. Select the
    :description: Guidelines to installing and testing the Lightning Kokkos device
    :link: ./lightning_kokkos/installation.html
 
+.. title-card::
+   :name: Lightning GPU
+   :description: Guidelines to installing and testing the Lightning GPU device
+   :link: ./lightning_gpu/installation.html
+
 .. raw:: html
 
         <div style='clear:both'></div>
@@ -24,3 +29,4 @@ Each device in the Lightning ecosystem is a separate Python package. Select the
 
    lightning_qubit/installation
    lightning_kokkos/installation
+   lightning_gpu/installation
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
new file mode 100644
index 0000000000..41c52de564
--- /dev/null
+++ b/doc/lightning_gpu/device.rst
@@ -0,0 +1,134 @@
+Lightning-GPU device
+======================
+
+The ``lightning.gpu`` device is an extension of PennyLane's built-in ``lightning.qubit`` device.
+It extends the CPU-focused Lightning simulator to run using the NVIDIA cuQuantum SDK, enabling GPU-accelerated simulation of quantum state-vector evolution.
+
+A ``lightning.gpu`` device can be loaded using:
+
+.. code-block:: python
+    import pennylane as qml
+    dev = qml.device("lightning.gpu", wires=2)
+If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA-capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or are not available on the library path, the device will fall back to ``lightning.qubit`` and perform all simulation on the CPU.
+
+The ``lightning.gpu`` device also directly supports quantum circuit gradients using the adjoint differentiation method. This can be enabled at the PennyLane QNode level with:
+
+.. code-block:: python
+    @qml.qnode(dev, diff_method="adjoint")
+    def circuit(params):
+        ...
+Check out the :doc:`/lightning_gpu/installation` guide for more information.
+
+Supported operations and observables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Supported operations:**
+
+.. raw:: html
+
+    <div class="summary-table">
+
+.. autosummary::
+    :nosignatures:
+
+    ~pennylane.BasisState
+    ~pennylane.CNOT
+    ~pennylane.ControlledPhaseShift
+    ~pennylane.ControlledQubitUnitary
+    ~pennylane.CPhase
+    ~pennylane.CRot
+    ~pennylane.CRX
+    ~pennylane.CRY
+    ~pennylane.CRZ
+    ~pennylane.CSWAP
+    ~pennylane.CY
+    ~pennylane.CZ
+    ~pennylane.DiagonalQubitUnitary
+    ~pennylane.DoubleExcitation
+    ~pennylane.DoubleExcitationMinus
+    ~pennylane.DoubleExcitationPlus
+    ~pennylane.ECR
+    ~pennylane.Hadamard
+    ~pennylane.Identity
+    ~pennylane.IsingXX
+    ~pennylane.IsingXY
+    ~pennylane.IsingYY
+    ~pennylane.IsingZZ
+    ~pennylane.ISWAP
+    ~pennylane.MultiControlledX
+    ~pennylane.MultiRZ
+    ~pennylane.OrbitalRotation
+    ~pennylane.PauliX
+    ~pennylane.PauliY
+    ~pennylane.PauliZ
+    ~pennylane.PhaseShift
+    ~pennylane.PSWAP
+    ~pennylane.QFT
+    ~pennylane.QubitCarry
+    ~pennylane.QubitStateVector
+    ~pennylane.QubitSum
+    ~pennylane.QubitUnitary
+    ~pennylane.Rot
+    ~pennylane.RX
+    ~pennylane.RY
+    ~pennylane.RZ
+    ~pennylane.S
+    ~pennylane.SingleExcitation
+    ~pennylane.SingleExcitationMinus
+    ~pennylane.SingleExcitationPlus
+    ~pennylane.SISWAP
+    ~pennylane.SQISW
+    ~pennylane.SWAP
+    ~pennylane.SX
+    ~pennylane.T
+    ~pennylane.Toffoli
+
+.. raw:: html
+
+    </div>
+
+**Supported observables:**
+
+.. raw:: html
+
+    <div class="summary-table">
+
+.. autosummary::
+    :nosignatures:
+
+    ~pennylane.Hadamard
+    ~pennylane.Identity
+    ~pennylane.PauliX
+    ~pennylane.PauliY
+    ~pennylane.PauliZ
+    ~pennylane.Hamiltonian
+    ~pennylane.SparseHamiltonian
+    ~pennylane.Hermitian
+    ~pennylane.Sum
+    ~pennylane.Prod
+    ~pennylane.SProd
+
+.. raw:: html
+
+    </div>
+
+
+
+**Parallel adjoint differentiation support:**
+
+The ``lightning.gpu`` device directly supports the `adjoint differentiation method <https://pennylane.ai/qml/demos/tutorial_adjoint_diff.html>`__, and enables parallelization over the requested observables. This supports direct controlling of observable batching, which can be used to run concurrent calculations across multiple available GPUs.
+
+If you are computing a large number of expectation values, or if you are using a large number of wires on your device, it may be best to evenly divide the number of expectation value calculations across all available GPUs. This will reduce the overall memory cost of the observables per GPU, at the cost of additional compute time. Assuming `m` observables, and `n` GPUs, the default behaviour is to pre-allocate all storage for `m` observables on a single GPU. To divide the workload amongst many GPUs, initialize a ``lightning.gpu`` device with the ``batch_obs=True`` keyword argument, as:
+
+.. code-block:: python
+    import pennylane as qml
+    dev = qml.device("lightning.gpu", wires=20, batch_obs=True)
+With the above, each GPU will see at most `m/n` observables to process, reducing the preallocated memory footprint.
+
+Additionally, there can be situations where even with the above distribution, and limited GPU memory, the overall problem does not fit on the requested GPU devices. You can further reduce the concurrent allocations on available GPUs by providing an integer value to the ``batch_obs`` keyword. For example, to batch evaluate observables with at most 1 observable allocation per GPU, define the device as:
+
+.. code-block:: python
+    import pennylane as qml
+    dev = qml.device("lightning.gpu", wires=27, batch_obs=1)
+Each problem is unique, so it can often be best to choose the default behaviour up-front, and tune with the above only if necessary.
+ 
\ No newline at end of file
diff --git a/doc/lightning_gpu/installation.rst b/doc/lightning_gpu/installation.rst
new file mode 100644
index 0000000000..9754aae396
--- /dev/null
+++ b/doc/lightning_gpu/installation.rst
@@ -0,0 +1,3 @@
+.. include:: ../../README.rst
+  :start-after:	installation_LGPU-start-inclusion-marker-do-not-remove
+  :end-before: installation_LGPU-end-inclusion-marker-do-not-remove
\ No newline at end of file
diff --git a/doc/lightning_gpu/package.rst b/doc/lightning_gpu/package.rst
new file mode 100644
index 0000000000..4a82eaa09e
--- /dev/null
+++ b/doc/lightning_gpu/package.rst
@@ -0,0 +1,18 @@
+lightning_gpu
+================
+
+.. automodapi:: pennylane_lightning.lightning_gpu
+    :no-heading:
+    :include-all-objects:
+
+.. raw:: html
+
+        <div style='clear:both'></div>
+        </br>
+
+Directly importing the device class:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python3
+    from pennylane_lightning.lightning_gpu import LightningGPU
+
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b377d8acd1..d46b1a7e90 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,4 +7,5 @@ pytest
 pytest-cov
 pytest-mock
 black==23.7.0
-clang-format==14
\ No newline at end of file
+clang-format==14
+custatevec-cu11
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index af606a496a..ef5c73ca83 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ pybind11
 pytest
 pytest-cov
 pytest-mock
+custatevec-cu11
\ No newline at end of file

From fb3bbc80b4073065b180e115a6ab697c162e4575 Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:31:10 +0000
Subject: [PATCH 02/30] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index d325038b25..af71dec2d3 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.33.0-dev21"
+__version__ = "0.33.0-dev22"

From f455882d1ff34b2488da35233d27ec84ffbd366b Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 18 Oct 2023 17:38:25 -0400
Subject: [PATCH 03/30] add changelog

---
 .github/CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 8d57d4dcb2..d6e25380dd 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -1,7 +1,9 @@
 # Release 0.33.0-dev
 
 ### New features since last release
-
+* Add docs to `lightning_gpu` with single-GPU features.
+  [(#525)](https://github.com/PennyLaneAI/pennylane-lightning/pull/525)
+  
 * Integrate Lightning-GPU into the Lightning monorepo. The new backend is named `lightning_gpu` and includes all single-GPU features.
   [(#499)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/499)
 

From 6f60d8718cb5efa35960dc642ad5186f9a47f798 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 24 Oct 2023 16:26:38 -0400
Subject: [PATCH 04/30] Update readme.

---
 README.rst | 369 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 221 insertions(+), 148 deletions(-)

diff --git a/README.rst b/README.rst
index a05336b55c..51385272e6 100644
--- a/README.rst
+++ b/README.rst
@@ -1,3 +1,4 @@
+#################
 Lightning Plugins
 #################
 
@@ -44,57 +45,107 @@ PennyLane supports Python 3.9 and above.
 .. header-end-inclusion-marker-do-not-remove
 
 
+********
 Features
-========
+********
+
+PennyLane-Lightning high performance simulators include the following backends:
+
+* ``lightning.qubit``: is a fast state-vector simulator written in C++.
+* ``lightning.gpu``: is a state-vector simulator based on the `NVIDIA cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_. It notably implements a distributed state-vector simulator based on MPI.
+* ``lightning.kokkos``: is a state-vector simulator written with `Kokkos <https://kokkos.github.io/kokkos-core-wiki/index.html>`_. It can exploit the inherent parallelism of modern processing units supporting the `OpenMP <https://www.openmp.org/>`_, `CUDA <https://developer.nvidia.com/cuda-toolkit>`_ or `HIP <https://docs.amd.com/projects/HIP/en/docs-5.3.0/index.html>`_ programming models.
+
+The following platforms are supported:
+
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+|           | L-Qubit | L-GPU  | L-GPU (MPI) | L-Kokkos (OMP) | L-Kokkos (CUDA) | L-Kokkos (HIP) |
++===========+=========+========+=============+================+=================+================+
+| Linux x86 | pip     | pip    | source      | pip            | source          | source         |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+| Linux ARM | pip     | source |             | pip            | source          | source         |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+| Linux PPC | pip     | source |             | pip            | source          | source         |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+| MacOS x86 | pip     |        |             | pip            |                 |                |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+| MacOS ARM | pip     |        |             | pip            |                 |                |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
+| Windows   | pip     |        |             |                |                 |                |
++-----------+---------+--------+-------------+----------------+-----------------+----------------+
 
-* Combine Lightning's high performance simulators with PennyLane's
-  automatic differentiation and optimization.
 
 .. installation_LQubit-start-inclusion-marker-do-not-remove
 
 
-Lightning Qubit installation
-============================
+************
+Installation
+************
+
+Pip installation
+================
+
+PyPI wheels
+-----------
 
-Lightning Qubit can be installed using ``pip``:
+Lightning plugins can be installed using ``pip`` as follows
 
 .. code-block:: console
 
     $ pip install pennylane-lightning
 
-To build Lightning from source you can run
+The above command will install the Lightning-Qubit plugin (the default since it is most broadly supported).
+In order to install the Lightning-GPU and Lightning-Kokkos (OpenMP) backends, you can respectively use the following commands:
+
+.. code-block:: console
+
+    $ pip install pennylane-lightning[gpu]
+    $ pip install pennylane-lightning[kokkos]
+
+
+Installing from source
+----------------------
+
+To build Lightning plugins from source you can run
 
 .. code-block:: console
 
-    $ pip install pybind11 pennylane-lightning --no-binary :all:
+    $ PL_BACKEND=${PL_BACKEND} pip install pybind11 pennylane-lightning --no-binary :all:
+
+where ``${PL_BACKEND}`` can be ``lightning_qubit`` (default), ``lightning_gpu`` or ``lightning_kokkos``.
+The `pybind11 <https://pybind11.readthedocs.io/en/stable/>`_ library is required to bind the C++ functionality to Python.
 
 A C++ compiler such as ``g++``, ``clang++``, or ``MSVC`` is required.
 On Debian-based systems, this can be installed via ``apt``:
 
 .. code-block:: console
 
-    $ sudo apt install g++
+    $ sudo apt -y update
+    $ sudo apt install g++ libomp-dev
 
+where ``libomp-dev`` is included to also install OpenMP.
 On MacOS, we recommend using the latest version of ``clang++`` and ``libomp``:
 
 .. code-block:: console
 
     $ brew install llvm libomp
 
-The `pybind11 <https://pybind11.readthedocs.io/en/stable/>`_ library is also used for binding the
-C++ functionality to Python.
+The Lightning-GPU backend has several dependencies (e.g. ``CUDA``, ``custatevec-cu11``, etc.), and hence we recommend referring to the `Lightning-GPU`_ section below.
+Similarly, for Lightning-Kokkos it is recommended to configure and install Kokkos independently as prescribed in the `Lightning-Kokkos`_ section below.
+
+Development installation
+------------------------
 
-Alternatively, for development and testing, you can install by cloning the repository:
+For development and testing, you can install by cloning the repository:
 
 .. code-block:: console
 
     $ git clone https://github.com/PennyLaneAI/pennylane-lightning.git
     $ cd pennylane-lightning
     $ pip install -r requirements.txt
-    $ pip install -e .
+    $ PL_BACKEND=${PL_BACKEND} pip install -e . -vv
 
 Note that subsequent calls to ``pip install -e .`` will use cached binaries stored in the
-``build`` folder. Run ``make clean`` if you would like to recompile.
+``build`` folder. Run ``make clean`` if you would like to recompile from scratch.
 
 You can also pass ``cmake`` options with ``CMAKE_ARGS`` as follows:
 
@@ -109,26 +160,35 @@ or with ``build_ext`` and the ``--define`` flag as follows:
     $ python3 setup.py build_ext -i --define="ENABLE_OPENMP=OFF;ENABLE_BLAS=OFF"
     $ python3 setup.py develop
 
+where ``-D`` must not be included before ``;``-separated options.
 
-Testing
--------
+Compile on Windows with MSVC
+----------------------------
+
+Lightning-Qubit can be compiled on Windows using the
+`Microsoft Visual C++ <https://visualstudio.microsoft.com/vs/features/cplusplus/>`_ compiler.
+You need `cmake <https://cmake.org/download/>`_ and an appropriate Python environment
+(e.g. using `Anaconda <https://www.anaconda.com/>`_).
 
-To test that the plugin is working correctly you can test the Python code within the cloned
-repository:
+We recommend using ``[x64 (or x86)] Native Tools Command Prompt for VS [version]`` to compile the library.
+Be sure that ``cmake`` and ``python`` can be called within the prompt.
 
 .. code-block:: console
 
-    $ make test-python
+    $ cmake --version
+    $ python --version
 
-while the C++ code can be tested with
+Then the usual installation commands will work.
 
 .. code-block:: console
 
-    $ make test-cpp
+    $ pip install -r requirements.txt
+    $ pip install -e .
 
+Note that OpenMP and BLAS are disabled on this platform.
 
-CMake Support
--------------
+CMake support
+=============
 
 One can also build the plugin using CMake:
 
@@ -137,212 +197,222 @@ One can also build the plugin using CMake:
     $ cmake -S. -B build
     $ cmake --build build
 
-To test the C++ code:
-
-.. code-block:: console
-
-    $ mkdir build && cd build
-    $ cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug ..
-    $ make
-
-Other supported options are
+Supported options are
 
-- ``-DENABLE_WARNINGS=ON``
-- ``-DENABLE_NATIVE=ON`` (for ``-march=native``)
-- ``-DENABLE_BLAS=ON``
-- ``-DENABLE_OPENMP=ON``
-- ``-DENABLE_CLANG_TIDY=ON``
+- ``-DENABLE_WARNINGS:BOOL=ON``
+- ``-DENABLE_NATIVE:BOOL=ON`` (for ``-march=native``)
+- ``-DENABLE_BLAS:BOOL=ON``
+- ``-DENABLE_OPENMP:BOOL=ON``
+- ``-DENABLE_CLANG_TIDY:BOOL=ON``
 
-Compile on Windows with MSVC
-----------------------------
-
-You can also compile Lightning on Windows using
-`Microsoft Visual C++ <https://visualstudio.microsoft.com/vs/features/cplusplus/>`_ compiler.
-You need `cmake <https://cmake.org/download/>`_ and appropriate Python environment
-(e.g. using `Anaconda <https://www.anaconda.com/>`_).
-
-
-We recommend to use ``[x64 (or x86)] Native Tools Command Prompt for VS [version]`` for compiling the library.
-Be sure that ``cmake`` and ``python`` can be called within the prompt.
+*******
+Testing
+*******
 
+To test that a plugin is working correctly, test the Python code with:
 
 .. code-block:: console
 
-    $ cmake --version
-    $ python --version
+    $ make test-python device=${PL_DEVICE}
 
-Then a common command will work.
+where ``${PL_DEVICE}`` can be ``lightning.qubit`` (default), ``lightning.gpu`` or ``lightning.kokkos``.
+These differ from ``${PL_BACKEND}`` by replacing the underscore by a dot.
+The C++ code can be tested with
 
 .. code-block:: console
 
-    $ pip install -r requirements.txt
-    $ pip install -e .
-
-Note that OpenMP and BLAS are disabled in this setting.
+    $ PL_BACKEND=${PL_BACKEND} make test-cpp
 
 
 .. installation_LQubit-end-inclusion-marker-do-not-remove
 
+.. installation_LGPU-start-inclusion-marker-do-not-remove
 
-.. installation_LKokkos-start-inclusion-marker-do-not-remove
+*************
+Lightning-GPU
+*************
 
-Lightning Kokkos installation
-=============================
+PyPI wheels
+===========
 
-For linux systems, `lightning.kokkos` and be readily installed with an OpenMP backend by providing the optional ``[kokkos]`` tag: 
+Lightning-GPU can be installed using ``pip``:
 
 .. code-block:: console
 
-    $ pip install pennylane-lightning[kokkos]
-
-This can be explicitly installed through PyPI as:
-
-.. code-block:: console
+    pip install pennylane-lightning[gpu]
 
-    $ pip install pennylane-lightning-kokkos
+Lightning-GPU requires the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ (only the `cuStateVec <https://docs.nvidia.com/cuda/cuquantum/latest/custatevec/index.html>`_ library is required).
+The SDK may be installed within the Python environment ``site-packages`` directory using ``pip`` or ``conda``, or the SDK library path may be appended to the ``LD_LIBRARY_PATH`` environment variable.
+Please see the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ install guide for more information.
 
+Installing from source
+======================
 
-Building from source
---------------------
+To build a wheel from the package sources using the direct SDK path:
 
-As Kokkos enables support for many different HPC-targetted hardware platforms, `lightning.kokkos` can be built to support any of these platforms when building from source.
+.. code-block:: console
 
-We suggest first installing Kokkos with the wanted configuration following the instructions found in the `Kokkos documentation <https://kokkos.github.io/kokkos-core-wiki/building.html>`_.
-Next, append the install location to ``CMAKE_PREFIX_PATH``.
-If an installation is not found, our builder will clone and install it during the build process.
+    python -m pip install wheel custatevec-cu11
+    python setup.py build_ext --cuquantum=<path to sdk>
+    python setup.py bdist_wheel
 
-The simplest way to install PennyLane-Lightning-Kokkos (OpenMP backend) is using ``pip``.
+The wheel can then be installed with:
 
 .. code-block:: console
 
-   CMAKE_ARGS="-DKokkos_ENABLE_OPENMP=ON" PL_BACKEND="lightning_kokkos" python -m pip install .
+    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
 
-or for an editable ``pip`` installation with:
+To simplify the build, we recommend using the containerized build process described in section `Docker support`_.
 
-.. code-block:: console
+Build Lightning-GPU with MPI
+----------------------------
 
-   CMAKE_ARGS="-DKokkos_ENABLE_OPENMP=ON" PL_BACKEND="lightning_kokkos" python -m pip install -e .
+Building Lightning-GPU with MPI also requires the ``NVIDIA cuQuantum SDK`` (currently supported version: `custatevec-cu11 <https://pypi.org/project/cuquantum-cu11/>`_), ``mpi4py`` and ``CUDA-aware MPI`` (Message Passing Interface).
+``CUDA-aware MPI`` allows data exchange between GPU memory spaces of different nodes without the need for CPU-mediated transfers.
+Both the ``MPICH`` and ``OpenMPI`` libraries are supported, provided they are compiled with CUDA support.
+The path to ``libmpi.so`` should be found in ``LD_LIBRARY_PATH``.
+It is recommended to install the ``NVIDIA cuQuantum SDK`` and ``mpi4py`` Python package with ``pip`` or ``conda`` inside a virtual environment.
+Please consult the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_, `mpi4py <https://mpi4py.readthedocs.io/en/stable/install.html>`_,
+`MPICH <https://www.mpich.org/static/downloads/4.1.1/mpich-4.1.1-README.txt>`_, or `OpenMPI <https://www.open-mpi.org/faq/?category=buildcuda>`_ install guide for more information.
 
-Alternatively, you can install the Python interface with:
+To build a wheel with MPI support using the direct SDK path:
 
 .. code-block:: console
 
-   CMAKE_ARGS="-DKokkos_ENABLE_OPENMP=ON" PL_BACKEND="lightning_kokkos" python setup.py build_ext
-   python setup.py bdist_wheel
-   pip install ./dist/PennyLane*.whl --force-reinstall
+    python -m pip install wheel 
+    python setup.py build_ext --define="ENABLE_MPI=ON" --cuquantum=<path to sdk>
+    python setup.py bdist_wheel
 
-To build the plugin directly with CMake:
+If cuStateVec is installed with ``pip``, you may set ``--cuquantum=${CUQUANTUM_SDK}`` where
 
 .. code-block:: console
 
-   cmake -B build -DKokkos_ENABLE_OPENMP=ON -DPLKOKKOS_BUILD_TESTS=ON -DPL_BACKEND=lightning_kokkos -G Ninja
-   cmake --build build
+    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
 
-The supported backend options are "SERIAL", "OPENMP", "THREADS", "HIP" and "CUDA" and the corresponding build options are ``-DKokkos_ENABLE_XXX=ON``, where ``XXX`` needs be replaced by the backend name, for instance ``OPENMP``.
-One can activate simultaneously one serial, one parallel CPU host (e.g. "OPENMP", "THREADS") and one parallel GPU device backend (e.g. "HIP", "CUDA"), but not two of any category at the same time.
-For "HIP" and "CUDA", the appropriate software stacks are required to enable compilation and subsequent use.
-Similarly, the CMake option ``-DKokkos_ARCH_{...}=ON`` must also be specified to target a given architecture.
-A list of the architectures is found on the `Kokkos wiki <https://github.com/kokkos/kokkos/wiki/Macros#architectures>`_.
-Note that "THREADS" backend is not recommended since `Kokkos <https://github.com/kokkos/kokkos-core-wiki/blob/17f08a6483937c26e14ec3c93a2aa40e4ce081ce/docs/source/ProgrammingGuide/Initialization.md?plain=1#L67>`_ does not guarantee its safety.
+The wheel can then be installed as:
 
+.. code-block:: console
+
+    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
 
 Testing
 =======
 
-To test with the ROCm stack using a manylinux2014 container we must first mount the repository into the container:
+Test Lightning-GPU with MPI support
+-----------------------------------
+
+You may test the Python layer of the MPI enabled plugin as follows:
 
 .. code-block:: console
 
-    docker run -v `pwd`:/io -it quay.io/pypa/manylinux2014_x86_64 bash
+    mpirun -np 2 python -m pytest mpitests --tb=short
 
-Next, within the container, we install the ROCm software stack:
+The C++ code is tested with
 
 .. code-block:: console
 
-    yum install -y https://repo.radeon.com/amdgpu-install/21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm
-    amdgpu-install --usecase=hiplibsdk,rocm --no-dkms
+    rm -rf ./BuildTests
+    cmake . -BBuildTests -DBUILD_TESTS=1 -DENABLE_MPI=ON -DCUQUANTUM_SDK=<path to sdk>
+    cmake --build ./BuildTests --verbose
+    cd ./BuildTests
+    for file in *runner_mpi ; do mpirun -np 2 ./$file ; done;
 
-We next build the test suite, with a given AMD GPU target in mind, as listed `here <https://github.com/kokkos/kokkos/blob/master/Makefile.kokkos>`_.
+.. installation_LGPU-end-inclusion-marker-do-not-remove
 
-.. code-block:: console
+.. installation_LKokkos-start-inclusion-marker-do-not-remove
 
-    cd /io
-    export PATH=$PATH:/opt/rocm/bin/
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib
-    export CXX=/opt/rocm/hip/bin/hipcc
-    cmake -B build -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc -DKokkos_ENABLE_HIP=ON -DPLKOKKOS_BUILD_TESTS=ON -DKokkos_ARCH_VEGA90A=ON
-    cmake --build build --verbose
+****************
+Lightning-Kokkos
+****************
 
-We may now leave the container, and run the built test suite on a machine with access to the targeted GPU.
+PyPI wheels
+===========
 
-For a system with access to the ROCm stack outside of a manylinux container, an editable ``pip`` installation can be built and installed as:
+On Linux systems, `lightning.kokkos` can be installed with the OpenMP backend by providing the optional ``[kokkos]`` tag:
 
 .. code-block:: console
 
-   CMAKE_ARGS="-DKokkos_ENABLE_HIP=ON -DKokkos_ARCH_VEGA90A=ON" PL_BACKEND="lightning_kokkos" python -m pip install -e .
-
-.. installation_LKokkos-end-inclusion-marker-do-not-remove
+    $ pip install pennylane-lightning[kokkos]
 
-.. installation_LGPU-start-inclusion-marker-do-not-remove
-Lightning GPU installation
-=============================
+Installing from source
+======================
 
-Note that Lightning Qubit should be installed before Lightning-GPU.
+As Kokkos enables support for many different HPC-targeted hardware platforms, `lightning.kokkos` can be built to support any of these platforms when building from source.
 
-Lightning-GPU can be installed using ``pip``:
+We suggest first installing Kokkos with the wanted configuration following the instructions found in the `Kokkos documentation <https://kokkos.github.io/kokkos-core-wiki/building.html>`_.
+For example,
 
 .. code-block:: console
-    pip install pennylane-lightning[gpu]
-Use of Lightning-GPU also requires explicit installation of the NVIDIA cuQuantum SDK. The SDK library directory may be provided on the ``LD_LIBRARY_PATH`` environment variable, or the SDK Python package may be installed within the Python environment ``site-packages`` directory using ``pip`` or ``conda``. Please see the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ install guide for more information.
 
-Building from source
---------------------
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+        -DCMAKE_INSTALL_PREFIX=/opt/kokkos/4.1.00/AMPERE80 \
+        -DCMAKE_CXX_STANDARD=20 \
+        -DBUILD_SHARED_LIBS:BOOL=ON \
+        -DBUILD_TESTING:BOOL=OFF \
+        -DKokkos_ENABLE_SERIAL:BOOL=ON \
+        -DKokkos_ENABLE_CUDA:BOOL=ON \
+        -DKokkos_ARCH_AMPERE80:BOOL=ON \
+        -DKokkos_ENABLE_EXAMPLES:BOOL=OFF \
+        -DKokkos_ENABLE_TESTS:BOOL=OFF \
+        -DKokkos_ENABLE_LIBDL:BOOL=OFF
+    cmake --build build && cmake --install build
+    echo export CMAKE_PREFIX_PATH=/opt/kokkos/4.1.00/AMPERE80:\$CMAKE_PREFIX_PATH
 
-To build a wheel from the package sources using the direct SDK path:
+Next, append the install location to ``CMAKE_PREFIX_PATH``.
+Note that the C++20 standard is required (``-DCMAKE_CXX_STANDARD=20`` option), and hence CUDA v12 is required for the CUDA backend.
+If an installation is not found, our builder will clone and install it during the build process.
 
-.. code-block:: console
-    PL_BACKEND="lightning_gpu" python -m pip install -e .
-To build using the PyPI/Conda installed cuQuantum package:
+The simplest way to install Lightning-Kokkos (OpenMP backend) is through ``pip``.
 
 .. code-block:: console
-    cmake -BBuild -DPL_BACKEND=lightning_gpu -DCUQUANTUM_SDK=<path to sdk>
-    cmake --build ./Build --verbose
-    python -m pip install wheel
-    PL_BACKEND="lightning_gpu" python setup.py build_ext
-    PL_BACKEND="lightning_gpu" python setup.py bdist_wheel
-The built wheel can now be installed as:
 
-.. code-block:: console
-    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
-To simplify the build, we recommend using the following containerized build process, which creates `manylinux2014 <https://github.com/pypa/manylinux>`_ compatible wheels.
+   CMAKE_ARGS="-DKokkos_ENABLE_OPENMP=ON" PL_BACKEND="lightning_kokkos" python -m pip install .
 
-To build the plugin directly with CMake:
+To build the plugin directly with CMake as above:
 
 .. code-block:: console
-   cmake -B build -DCUQUANTUM_SDK=<path to sdk> -DBUILD_TESTS=ON -DPL_BACKEND=lightning_gpu -G Ninja
+
+   cmake -B build -DKokkos_ENABLE_OPENMP=ON -DPL_BACKEND=lightning_kokkos -G Ninja
    cmake --build build
-.. installation_LGPU-end-inclusion-marker-do-not-remove
+
+The supported backend options are "SERIAL", "OPENMP", "THREADS", "HIP" and "CUDA" and the corresponding build options are ``-DKokkos_ENABLE_XXX=ON``, where ``XXX`` needs to be replaced by the backend name, for instance ``OPENMP``.
+One can activate simultaneously one serial, one parallel CPU host (e.g. "OPENMP", "THREADS") and one parallel GPU device backend (e.g. "HIP", "CUDA"), but not two of any category at the same time.
+For "HIP" and "CUDA", the appropriate software stacks are required to enable compilation and subsequent use.
+Similarly, the CMake option ``-DKokkos_ARCH_{...}=ON`` must also be specified to target a given architecture.
+A list of the architectures is found on the `Kokkos wiki <https://github.com/kokkos/kokkos/wiki/Macros#architectures>`_.
+Note that "THREADS" backend is not recommended since `Kokkos <https://github.com/kokkos/kokkos-core-wiki/blob/17f08a6483937c26e14ec3c93a2aa40e4ce081ce/docs/source/ProgrammingGuide/Initialization.md?plain=1#L67>`_ does not guarantee its safety.
+
+.. installation_LKokkos-end-inclusion-marker-do-not-remove
 
 Please refer to the `plugin documentation <https://docs.pennylane.ai/projects/lightning/>`_ as
 well as to the `PennyLane documentation <https://docs.pennylane.ai/>`_ for further reference.
 
+**************
+Docker support
+**************
 
-Docker Support
---------------
-
-One can also build the Lightning image using Docker:
+Docker images for the various backends are found on the
+`PennyLane Docker Hub <https://hub.docker.com/repository/docker/pennylaneai/pennylane/general>`_ page, where there is also a detailed description about PennyLane Docker support.
+Briefly, one can build the Docker Lightning images using:
 
 .. code-block:: console
 
     $ git clone https://github.com/PennyLaneAI/pennylane-lightning.git
     $ cd pennylane-lightning
-    $ docker build -t lightning/base -f docker/Dockerfile .
+    $ docker build -f docker/Dockerfile --target ${TARGET} .
 
-Please refer to the `PennyLane installation <https://docs.pennylane.ai/en/stable/development/guide/installation.html#installation>`_ for detailed description about PennyLane Docker support.
+where ``${TARGET}`` is one of the following
 
+* wheel-lightning-qubit
+* wheel-lightning-kokkos-openmp
+* wheel-lightning-kokkos-cuda
+* wheel-lightning-gpu
+* wheel-lightning-kokkos-rocm
 
+************
 Contributing
-============
+************
 
 We welcome contributions - simply fork the repository of this plugin, and then make a
 `pull request <https://help.github.com/articles/about-pull-requests/>`_ containing your contribution.
@@ -361,9 +431,9 @@ The Python code is statically analyzed with `Pylint <https://pylint.readthedocs.
 We set up a pre-commit hook (see `Git hooks <https://git-scm.com/docs/githooks>`_) to run both of these on `git commit`.
 Please make your best effort to comply with `black` and `pylint` before using disabling pragmas (e.g. `# pylint: disable=missing-function-docstring`).
 
-
+*******
 Authors
-=======
+*******
 
 Lightning is the work of `many contributors <https://github.com/PennyLaneAI/pennylane-lightning/graphs/contributors>`_.
 
@@ -376,9 +446,9 @@ If you are doing research using PennyLane and Lightning, please cite `our paper
 
 .. support-start-inclusion-marker-do-not-remove
 
-
+*******
 Support
-=======
+*******
 
 - **Source Code:** https://github.com/PennyLaneAI/pennylane-lightning
 - **Issue Tracker:** https://github.com/PennyLaneAI/pennylane-lightning/issues
@@ -390,22 +460,25 @@ by asking a question in the forum.
 .. support-end-inclusion-marker-do-not-remove
 .. license-start-inclusion-marker-do-not-remove
 
-
+*******
 License
-=======
+*******
 
-The PennyLane lightning plugin is **free** and **open source**, released under
+The Lightning plugins are **free** and **open source**, released under
 the `Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`_.
+The Lightning-GPU plugin makes use of the NVIDIA cuQuantum SDK headers to
+enable the device bindings to PennyLane, which are held to their own respective license.
 
 .. license-end-inclusion-marker-do-not-remove
 .. acknowledgements-start-inclusion-marker-do-not-remove
 
+****************
 Acknowledgements
-================
+****************
 
 PennyLane Lightning makes use of the following libraries and tools, which are under their own respective licenses:
 
 - **pybind11:** https://github.com/pybind/pybind11
 - **Kokkos Core:** https://github.com/kokkos/kokkos
 
-.. acknowledgements-end-inclusion-marker-do-not-remove
\ No newline at end of file
+.. acknowledgements-end-inclusion-marker-do-not-remove

From 95044543e1206c28f2654774e08e75e83970ed73 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 24 Oct 2023 16:26:55 -0400
Subject: [PATCH 05/30] shush CI [skip ci]


From e72cd98844492f4cf70780fa4e0ce6208fcda82e Mon Sep 17 00:00:00 2001
From: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Date: Tue, 24 Oct 2023 20:28:53 +0000
Subject: [PATCH 06/30] Auto update version

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index b31d7d35ba..d276e92d29 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.33.0-dev23"
+__version__ = "0.33.0-dev24"

From 93608ee11e44cb611a5cd486edc4db07753112ca Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 24 Oct 2023 17:03:47 -0400
Subject: [PATCH 07/30] Fix README links and code-blocks. [skip ci]

---
 README.rst                    | 24 +++++++++---------------
 doc/lightning_gpu/device.rst  |  8 ++++++++
 doc/lightning_gpu/package.rst |  1 +
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/README.rst b/README.rst
index 51385272e6..5508f79e93 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,3 @@
-#################
 Lightning Plugins
 #################
 
@@ -45,7 +44,6 @@ PennyLane supports Python 3.9 and above.
 .. header-end-inclusion-marker-do-not-remove
 
 
-********
 Features
 ********
 
@@ -77,7 +75,6 @@ The following platforms are supported:
 .. installation_LQubit-start-inclusion-marker-do-not-remove
 
 
-************
 Installation
 ************
 
@@ -129,8 +126,8 @@ On MacOS, we recommend using the latest version of ``clang++`` and ``libomp``:
 
     $ brew install llvm libomp
 
-The Lightning-GPU backend has several dependencies (e.g. ``CUDA``, ``custatevec-cu11``, etc.), and hence we recommend referring to `Lightning-GPU`_ section below.
-Similarly, for Lightning-Kokkos it is recommended to configure and install Kokkos independently as prescribed in the `Lightning-Kokkos`_ section below.
+The Lightning-GPU backend has several dependencies (e.g. ``CUDA``, ``custatevec-cu11``, etc.), and hence we recommend referring to `Lightning-GPU <lightning-gpu>`_ section below.
+Similarly, for Lightning-Kokkos it is recommended to configure and install Kokkos independently as prescribed in the `Lightning-Kokkos <lightning-kokkos>`_ section below.
 
 Development installation
 ------------------------
@@ -205,7 +202,6 @@ Supported options are
 - ``-DENABLE_OPENMP:BOOL=ON``
 - ``-DENABLE_CLANG_TIDY:BOOL=ON``
 
-*******
 Testing
 *******
 
@@ -228,7 +224,8 @@ The C++ code can be tested with
 
 .. installation_LGPU-start-inclusion-marker-do-not-remove
 
-*************
+.. _lightning-gpu:
+
 Lightning-GPU
 *************
 
@@ -262,7 +259,7 @@ The wheel can then be installed with:
 
     python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
 
-To simplify the build, we recommend using the containerized build process described in section `Docker support`_.
+To simplify the build, we recommend using the containerized build process described in section `Docker support <docker-support>`_.
 
 Build Lightning-GPU with MPI
 ----------------------------
@@ -321,7 +318,8 @@ The C++ code is tested with
 
 .. installation_LKokkos-start-inclusion-marker-do-not-remove
 
-****************
+.. _lightning-kokkos:
+
 Lightning-Kokkos
 ****************
 
@@ -388,7 +386,8 @@ Note that "THREADS" backend is not recommended since `Kokkos <https://github.com
 Please refer to the `plugin documentation <https://docs.pennylane.ai/projects/lightning/>`_ as
 well as to the `PennyLane documentation <https://docs.pennylane.ai/>`_ for further reference.
 
-**************
+.. _docker-support:
+
 Docker support
 **************
 
@@ -410,7 +409,6 @@ where ``${TARGET}`` is one of the following
 * wheel-lightning-gpu
 * wheel-lightning-kokkos-rocm
 
-************
 Contributing
 ************
 
@@ -431,7 +429,6 @@ The Python code is statically analyzed with `Pylint <https://pylint.readthedocs.
 We set up a pre-commit hook (see `Git hooks <https://git-scm.com/docs/githooks>`_) to run both of these on `git commit`.
 Please make your best effort to comply with `black` and `pylint` before using disabling pragmas (e.g. `# pylint: disable=missing-function-docstring`).
 
-*******
 Authors
 *******
 
@@ -446,7 +443,6 @@ If you are doing research using PennyLane and Lightning, please cite `our paper
 
 .. support-start-inclusion-marker-do-not-remove
 
-*******
 Support
 *******
 
@@ -460,7 +456,6 @@ by asking a question in the forum.
 .. support-end-inclusion-marker-do-not-remove
 .. license-start-inclusion-marker-do-not-remove
 
-*******
 License
 *******
 
@@ -472,7 +467,6 @@ enable the device bindings to PennyLane, which are held to their own respective
 .. license-end-inclusion-marker-do-not-remove
 .. acknowledgements-start-inclusion-marker-do-not-remove
 
-****************
 Acknowledgements
 ****************
 
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index 41c52de564..9461ec3e9b 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -7,16 +7,20 @@ It extends the CPU-focused Lightning simulator to run using the NVIDIA cuQuantum
 A ``lightning.gpu`` device can be loaded using:
 
 .. code-block:: python
+
     import pennylane as qml
     dev = qml.device("lightning.gpu", wires=2)
+
 If the NVIDIA cuQuantum libraries are available, the above device will allow all operations to be performed on a CUDA capable GPU of generation SM 7.0 (Volta) and greater. If the libraries are not correctly installed, or available on path, the device will fall-back to ``lightning.qubit`` and perform all simulation on the CPU.
 
 The ``lightning.gpu`` device also directly supports quantum circuit gradients using the adjoint differentiation method. This can be enabled at the PennyLane QNode level with:
 
 .. code-block:: python
+
     qml.qnode(dev, diff_method="adjoint")
     def circuit(params):
         ...
+
 Check out the :doc:`/lightning_gpu/installation` guide for more information.
 
 Supported operations and observables
@@ -121,14 +125,18 @@ The ``lightning.gpu`` device directly supports the `adjoint differentiation meth
 If you are computing a large number of expectation values, or if you are using a large number of wires on your device, it may be best to evenly divide the number of expectation value calculations across all available GPUs. This will reduce the overall memory cost of the observables per GPU, at the cost of additional compute time. Assuming `m` observables, and `n` GPUs, the default behaviour is to pre-allocate all storage for `n` observables on a single GPU. To divide the workload amongst many GPUs, initialize a ``lightning.gpu`` device with the ``batch_obs=True`` keyword argument, as:
 
 .. code-block:: python
+
     import pennylane as qml
     dev = qml.device("lightning.gpu", wires=20, batch_obs=True)
+
 With the above, each GPU will see at most `m/n` observables to process, reducing the preallocated memory footprint.
 
 Additionally, there can be situations where even with the above distribution, and limited GPU memory, the overall problem does not fit on the requested GPU devices. You can further reduce the concurrent allocations on available GPUs by providing an integer value to the `batch_obs` keyword. For example, to batch evaluate observables with at most 1 observable allocation per GPU, define the device as:
 
 .. code-block:: python
+
     import pennylane as qml
     dev = qml.device("lightning.gpu", wires=27, batch_obs=1)
+
 Each problem is unique, so it can often be best to choose the default behaviour up-front, and tune with the above only if necessary.
  
\ No newline at end of file
diff --git a/doc/lightning_gpu/package.rst b/doc/lightning_gpu/package.rst
index 4a82eaa09e..6630d64cd8 100644
--- a/doc/lightning_gpu/package.rst
+++ b/doc/lightning_gpu/package.rst
@@ -14,5 +14,6 @@ Directly importing the device class:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: python3
+
     from pennylane_lightning.lightning_gpu import LightningGPU
 

From 9738aa0e8421432c6dd4d464e4ac6cdd534f0294 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 24 Oct 2023 17:33:54 -0400
Subject: [PATCH 08/30] Fix card links.

---
 doc/index.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index 25239c62c1..08012049a0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -21,17 +21,17 @@ The Lightning ecosystem provides the following devices:
 .. title-card::
     :name: 'lightning.qubit'
     :description: A fast state-vector qubit simulator written in C++
-    :link: lightning_qubit/device.html
+    :link: lightning_qubit/device/index.html
 
 .. title-card::
     :name: 'lightning.kokkos'
     :description: A heterogeneous backend state-vector simulator with Kokkos library support.
-    :link: lightning_kokkos/device.html
+    :link: lightning_kokkos/device/index.html
 
 .. title-card::
     :name: 'lightning.gpu'
     :description: A heterogeneous backend state-vector simulator with NVIDIA cuQuantum library support.
-    :link: lightning_gpu/device.html
+    :link: lightning_gpu/device/index.html
 
 .. raw:: html
 

From 35645e6dfc5ab84c61d0d8ce66a353873a46477e Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Tue, 24 Oct 2023 17:34:53 -0400
Subject: [PATCH 09/30] Fix obs signature to match LK/LQ

---
 doc/lightning_gpu/device.rst | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index 9461ec3e9b..c79424e87b 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -100,17 +100,19 @@ Supported operations and observables
 .. autosummary::
     :nosignatures:
 
+    ~pennylane.ops.op_math.Exp
     ~pennylane.Hadamard
+    ~pennylane.Hamiltonian
+    ~pennylane.Hermitian
     ~pennylane.Identity
     ~pennylane.PauliX
     ~pennylane.PauliY
     ~pennylane.PauliZ
-    ~pennylane.Hamiltonian
+    ~pennylane.ops.op_math.Prod
+    ~pennylane.Projector
     ~pennylane.SparseHamiltonian
-    ~pennylane.Hermitian
-    ~pennylane.Sum
-    ~pennylane.Prod
-    ~pennylane.SProd
+    ~pennylane.ops.op_math.SProd
+    ~pennylane.ops.op_math.Sum
 
 .. raw:: html
 
@@ -139,4 +141,4 @@ Additionally, there can be situations where even with the above distribution, an
     dev = qml.device("lightning.gpu", wires=27, batch_obs=1)
 
 Each problem is unique, so it can often be best to choose the default behaviour up-front, and tune with the above only if necessary.
- 
\ No newline at end of file
+ 

From 28d0d8f47604c71b174d42b9695b2a9ea8a6b021 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Tue, 24 Oct 2023 17:42:39 -0400
Subject: [PATCH 10/30] Revert card links. [skip ci]

---
 doc/index.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index 08012049a0..25239c62c1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -21,17 +21,17 @@ The Lightning ecosystem provides the following devices:
 .. title-card::
     :name: 'lightning.qubit'
     :description: A fast state-vector qubit simulator written in C++
-    :link: lightning_qubit/device/index.html
+    :link: lightning_qubit/device.html
 
 .. title-card::
     :name: 'lightning.kokkos'
     :description: A heterogeneous backend state-vector simulator with Kokkos library support.
-    :link: lightning_kokkos/device/index.html
+    :link: lightning_kokkos/device.html
 
 .. title-card::
     :name: 'lightning.gpu'
     :description: A heterogeneous backend state-vector simulator with NVIDIA cuQuantum library support.
-    :link: lightning_gpu/device/index.html
+    :link: lightning_gpu/device.html
 
 .. raw:: html
 

From 624bb58522b1e56f4c2ab9100ca8c3dd5cabdb0a Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 25 Oct 2023 16:43:12 +0000
Subject: [PATCH 11/30] update docs

---
 .github/CHANGELOG.md         |   2 +-
 doc/lightning_gpu/device.rst | 140 +++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 3a16ff2d00..5b7b97158f 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ### New features since last release
 
-* Add docs to `lightning_gpu` with single-GPU features.
+* Add docs to the `lightning_gpu` backend.
   [(#525)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/525)
 
 * Integrate the distributed C++ backend of Lightning-GPU into the Lightning monorepo.
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index c79424e87b..b280b6a50d 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -142,3 +142,143 @@ Additionally, there can be situations where even with the above distribution, an
 
 Each problem is unique, so it can often be best to choose the default behaviour up-front, and tune with the above only if necessary.
  
+**Multi-GPU/multi-node support:**
+
+The ``lightning.gpu`` device allows users to leverage the computational power of many GPUs sitting on separate nodes for running large-scale simulations. 
+Provided that NVIDIA ``cuQuantum`` libraries, a ``CUDA-aware MPI`` library and ``mpi4py`` are properly installed and the path to the ``libmpi.so`` is 
+added to the ``LD_LIBRARY_PATH`` environment variable, the following requirements should be met to enable multi-node and multi-GPU simulations:
+
+1. The ``mpi`` keyword argument should be set as ``True`` when initializing a ``lightning.gpu`` device.
+2. Both the total number of MPI processes and MPI processes per node must be powers of 2. For example, 2, 4, 8, 16, etc. Each MPI process is responsible for managing one GPU.
+
+The workflow for the multi-node/GPUs feature is as follows:
+
+.. code-block:: python
+
+    from mpi4py import MPI
+    import pennylane as qml
+    dev = qml.device('lightning.gpu', wires=8, mpi=True)
+    @qml.qnode(dev)
+    def circuit_mpi():
+        qml.PauliX(wires=[0])
+        return qml.state()
+    local_state_vector = circuit_mpi()
+
+Currently, a ``lightning.gpu`` device with the MPI multi-GPU backend supports all the ``gate operations`` and ``observables`` that a ``lightning.gpu`` device with a single GPU/node backend supports.
+
+By default, each MPI process will return the overall simulation results, except for the ``qml.state()`` and ``qml.probs()`` methods, for which each MPI process only returns its local simulation
+results to avoid buffer overflow. It is the user's responsibility to ensure correct data collection for those two methods. Here are examples of collecting
+the local simulation results for the ``qml.state()`` and ``qml.probs()`` methods:
+
+The workflow for collecting local state vector (using the ``qml.state()`` method) to ``rank 0`` is as follows:
+
+.. code-block:: python
+
+    from mpi4py import MPI
+    import pennylane as qml
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank() 
+    dev = qml.device('lightning.gpu', wires=8, mpi=True)
+    @qml.qnode(dev)
+    def circuit_mpi():
+        qml.PauliX(wires=[0])
+        return qml.state()
+    local_state_vector = circuit_mpi()
+    #rank 0 will collect the local state vector
+    state_vector = comm.gather(local_state_vector, root=0)
+    if rank == 0:
+        print(state_vector)
+    
+The workflow for collecting the local probabilities (using the ``qml.probs()`` method) to ``rank 0`` is as follows:
+
+.. code-block:: python
+    
+    from mpi4py import MPI
+    import pennylane as qml
+    import numpy as np
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    dev = qml.device('lightning.gpu', wires=8, mpi=True)
+    prob_wires = [0, 1]
+
+    @qml.qnode(dev)
+    def mpi_circuit():
+        qml.Hadamard(wires=1)
+        return qml.probs(wires=prob_wires)
+
+    local_probs = mpi_circuit()
+ 
+    #For data collection across MPI processes.
+    recv_counts = comm.gather(len(local_probs),root=0)
+    if rank == 0:
+        probs = np.zeros(2**len(prob_wires))
+    else:
+        probs = None
+
+    comm.Gatherv(local_probs,[probs,recv_counts],root=0)
+    if rank == 0:
+        print(probs)
+
+Then the Python script can be executed with the following command:
+
+.. code-block:: console
+    
+    $ mpirun -np 4 python yourscript.py
+
+Furthermore, users can optimize the performance of their applications by allocating the appropriate amount of GPU memory for MPI operations with the ``mpi_buf_size`` keyword argument. To allocate ``n`` mebibytes (MiB, `2^20` bytes) of GPU memory for MPI operations, initialize a ``lightning.gpu`` device with the ``mpi_buf_size=n`` keyword argument, as follows:
+
+.. code-block:: python
+
+    from mpi4py import MPI
+    import pennylane as qml
+    n = 8
+    dev = qml.device("lightning.gpu", wires=20, mpi=True, mpi_buf_size=n)
+
+Note that the value of ``mpi_buf_size`` should also be a power of ``2``. Remember to carefully manage the ``mpi_buf_size`` parameter, taking into account the available GPU memory and the memory
+requirements of the local state vector, to prevent memory overflow issues and ensure optimal performance. By default (``mpi_buf_size=0``), the GPU memory allocated for MPI operations 
+will match the size of the local state vector, with a limit of ``64 MiB``. Please be aware that a runtime warning will occur if the local GPU memory buffer for MPI operations exceeds
+the GPU memory allocated to the local state vector.
+
+**Multi-GPU/multi-node support for adjoint method:**
+
+The ``lightning.gpu`` device with the multi-GPU/multi-node backend also directly supports the `adjoint differentiation method <https://pennylane.ai/qml/demos/tutorial_adjoint_diff.html>`__. Instead of batching observables across the multiple GPUs available within a node, the state vector is distributed among the available GPUs with the multi-GPU/multi-node backend.
+By default, the adjoint method with MPI support follows the performance-oriented implementation of the single GPU backend. This means that a separate ``bra`` is created for each observable and the ``ket`` is updated only once for each operation, regardless of the number of observables.
+
+The workflow for the default adjoint method with MPI support is as follows:
+
+.. code-block:: python
+    
+    from mpi4py import MPI
+    import pennylane as qml
+    from pennylane import numpy as np
+  
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    n_wires = 20
+    n_layers = 2
+  
+    dev = qml.device('lightning.gpu', wires= n_wires, mpi=True)
+    @qml.qnode(dev, diff_method="adjoint")
+    def circuit_adj(weights):
+        qml.StronglyEntanglingLayers(weights, wires=list(range(n_wires)))
+        return qml.math.hstack([qml.expval(qml.PauliZ(i)) for i in range(n_wires)])
+  
+    if rank == 0:
+        params = np.random.random(qml.StronglyEntanglingLayers.shape(n_layers=n_layers, n_wires=n_wires))
+    else:
+        params = None
+  
+    params = comm.bcast(params, root=0)
+    jac = qml.jacobian(circuit_adj)(params)
+
+If users aim to handle larger system sizes with limited hardware resources, the memory-optimized adjoint method with MPI support is more appropriate. The memory-optimized adjoint method with MPI support employs a single ``bra`` object that is reused for all observables.
+This approach results in a notable reduction in the required GPU memory when dealing with a large number of observables. However, it's important to note that the reduction in memory requirement may come at the expense of slower execution due to the multiple ``ket`` updates per gate operation.
+
+To enable the memory-optimized adjoint method with MPI support, ``batch_obs`` should be set as ``True`` and the workflow follows:
+
+.. code-block:: python
+    
+    dev = qml.device('lightning.gpu', wires= n_wires, mpi=True, batch_obs=True)
+
+For the adjoint method, each MPI process will provide the overall simulation results.
\ No newline at end of file

From 7197dca2bc16442a4189a5386d72bbe0c2b1b15e Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 25 Oct 2023 16:43:59 +0000
Subject: [PATCH 12/30] update readme

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 5508f79e93..8fefa8a017 100644
--- a/README.rst
+++ b/README.rst
@@ -474,5 +474,6 @@ PennyLane Lightning makes use of the following libraries and tools, which are un
 
 - **pybind11:** https://github.com/pybind/pybind11
 - **Kokkos Core:** https://github.com/kokkos/kokkos
+- **NVIDIA cuQuantum:** https://developer.nvidia.com/cuquantum-sdk
 
 .. acknowledgements-end-inclusion-marker-do-not-remove

From 7f512ba1b311fbc30d8bf833474f62f8ba8a8ae6 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 25 Oct 2023 14:24:21 -0400
Subject: [PATCH 13/30] Reorder cards and add docker support section. [skip ci]

---
 README.rst           | 19 ++++++++++++-------
 doc/index.rst        | 13 +++++++------
 doc/installation.rst |  9 ++++-----
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/README.rst b/README.rst
index 8fefa8a017..380d736863 100644
--- a/README.rst
+++ b/README.rst
@@ -41,7 +41,6 @@ The Lightning plugin ecosystem provides fast state-vector simulators written in
 learning, automatic differentiation, and optimization of hybrid quantum-classical computations.
 PennyLane supports Python 3.9 and above.
 
-.. header-end-inclusion-marker-do-not-remove
 
 
 Features
@@ -53,7 +52,9 @@ PennyLane-Lightning high performance simulators include the following backends:
 * ``lightning.gpu``: is a state-vector simulator based on the `NVIDIA cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_. It notably implements a distributed state-vector simulator based on MPI.
 * ``lightning.kokkos``: is a state-vector simulator written with `Kokkos <https://kokkos.github.io/kokkos-core-wiki/index.html>`_. It can exploit the inherent parallelism of modern processing units supporting the `OpenMP <https://www.openmp.org/>`_, `CUDA <https://developer.nvidia.com/cuda-toolkit>`_ or `HIP <https://docs.amd.com/projects/HIP/en/docs-5.3.0/index.html>`_ programming models.
 
-The following platforms are supported:
+.. header-end-inclusion-marker-do-not-remove
+
+The following table summarizes the supported platforms and the primary installation mode:
 
 +-----------+---------+--------+-------------+----------------+-----------------+----------------+
 |           | L-Qubit | L-GPU  | L-GPU (MPI) | L-Kokkos (OMP) | L-Kokkos (CUDA) | L-Kokkos (HIP) |
@@ -386,6 +387,8 @@ Note that "THREADS" backend is not recommended since `Kokkos <https://github.com
 Please refer to the `plugin documentation <https://docs.pennylane.ai/projects/lightning/>`_ as
 well as to the `PennyLane documentation <https://docs.pennylane.ai/>`_ for further reference.
 
+.. docker-start-inclusion-marker-do-not-remove
+
 .. _docker-support:
 
 Docker support
@@ -403,11 +406,13 @@ Briefly, one can build the Docker Lightning images using:
 
 where ``${TARGET}`` is one of the following
 
-* wheel-lightning-qubit
-* wheel-lightning-kokkos-openmp
-* wheel-lightning-kokkos-cuda
-* wheel-lightning-gpu
-* wheel-lightning-kokkos-rocm
+* ``wheel-lightning-qubit``
+* ``wheel-lightning-gpu``
+* ``wheel-lightning-kokkos-openmp``
+* ``wheel-lightning-kokkos-cuda``
+* ``wheel-lightning-kokkos-rocm``
+
+.. docker-end-inclusion-marker-do-not-remove
 
 Contributing
 ************
diff --git a/doc/index.rst b/doc/index.rst
index 25239c62c1..695c3bb9d0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -23,16 +23,16 @@ The Lightning ecosystem provides the following devices:
     :description: A fast state-vector qubit simulator written in C++
     :link: lightning_qubit/device.html
 
-.. title-card::
-    :name: 'lightning.kokkos'
-    :description: A heterogeneous backend state-vector simulator with Kokkos library support.
-    :link: lightning_kokkos/device.html
-
 .. title-card::
     :name: 'lightning.gpu'
     :description: A heterogeneous backend state-vector simulator with NVIDIA cuQuantum library support.
     :link: lightning_gpu/device.html
 
+.. title-card::
+    :name: 'lightning.kokkos'
+    :description: A heterogeneous backend state-vector simulator with Kokkos library support.
+    :link: lightning_kokkos/device.html
+
 .. raw:: html
 
     <div style='clear:both'></div>
@@ -44,6 +44,7 @@ The Lightning ecosystem provides the following devices:
    :hidden:
 
    installation
+   docker
    support
 
 .. toctree::
@@ -52,8 +53,8 @@ The Lightning ecosystem provides the following devices:
    :hidden:
 
    lightning_qubit/device
-   lightning_kokkos/device
    lightning_gpu/device
+   lightning_kokkos/device
 
 .. toctree::
    :maxdepth: 2
diff --git a/doc/installation.rst b/doc/installation.rst
index 16300d2a13..e31963aff9 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -8,17 +8,16 @@ Each device in the Lightning ecosystem is a separate Python package. Select the
    :description: Guidelines to installing and testing the Lightning Qubit device.
    :link: ./lightning_qubit/installation.html
 
+.. title-card::
+   :name: Lightning GPU
+   :description: Guidelines to installing and testing the Lightning GPU device
+   :link: ./lightning_gpu/installation.html
 
 .. title-card::
    :name: Lightning Kokkos
    :description: Guidelines to installing and testing the Lightning Kokkos device
    :link: ./lightning_kokkos/installation.html
 
-.. title-card::
-   :name: Lightning GPU
-   :description: Guidelines to installing and testing the Lightning GPU device
-   :link: ./lightning_gpu/installation.html
-
 .. raw:: html
 
         <div style='clear:both'></div>

From 50a3f882a4615c947efade30daca11912c016615 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 14:25:20 -0400
Subject: [PATCH 14/30] Build with CUDA on the CI for correct API gen

---
 .readthedocs.yml     | 8 +++++++-
 doc/requirements.txt | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 00a2f7e4b8..9ffd82d5ec 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -21,8 +21,14 @@ build:
     - libopenblas-base
     - libopenblas-dev
     - graphviz
+    - cuda
   jobs:
     pre_install:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
+      - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
+      - dpkg -i cuda-keyring_1.0-1_all.deb
+      - apt-get update
     post_install:
-      - PL_BACKEND="lightning_kokkos" pip install -e . -vv
+      - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
+      - rm -rf ./build && PATH=$PATH:/usr/local/cuda/bin PL_BACKEND="lightning_GPU" python setup.py bdist_wheel
+      - python -m pip install ./dist/*.whl
diff --git a/doc/requirements.txt b/doc/requirements.txt
index 3acc41f79a..f6d4fac3d1 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -6,3 +6,5 @@ pybind11
 sphinx
 sphinx-automodapi
 pennylane-sphinx-theme
+custatevec-cu11
+wheel

From c9ba0bdb6762aa52ee4e6f18d65afecae95f4511 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 25 Oct 2023 14:28:22 -0400
Subject: [PATCH 15/30] Add docker.rst [skip ci].

---
 doc/docker.rst | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 doc/docker.rst

diff --git a/doc/docker.rst b/doc/docker.rst
new file mode 100644
index 0000000000..85ae81ba73
--- /dev/null
+++ b/doc/docker.rst
@@ -0,0 +1,3 @@
+.. include:: ../README.rst
+  :start-after:	docker-start-inclusion-marker-do-not-remove
+  :end-before: docker-end-inclusion-marker-do-not-remove

From 05ce142b12c95292b334d771ab0bebe5914ef2f6 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 15:10:54 -0400
Subject: [PATCH 16/30] Add Cuda 11.8 install

---
 .readthedocs.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 9ffd82d5ec..8f4e3c044e 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -21,13 +21,12 @@ build:
     - libopenblas-base
     - libopenblas-dev
     - graphviz
-    - cuda
   jobs:
     pre_install:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
       - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
       - dpkg -i cuda-keyring_1.0-1_all.deb
-      - apt-get update
+      - apt-get update && apt install cuda-11.8
     post_install:
       - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
       - rm -rf ./build && PATH=$PATH:/usr/local/cuda/bin PL_BACKEND="lightning_GPU" python setup.py bdist_wheel

From 0ffed73132535849d4850e6ba52f57ccbb1acc04 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 15:14:35 -0400
Subject: [PATCH 17/30] Lower CUDA version

---
 .readthedocs.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 8f4e3c044e..8fb027ba4b 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -21,12 +21,10 @@ build:
     - libopenblas-base
     - libopenblas-dev
     - graphviz
+    - nvidia-cuda-toolkit
   jobs:
     pre_install:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
-      - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
-      - dpkg -i cuda-keyring_1.0-1_all.deb
-      - apt-get update && apt install cuda-11.8
     post_install:
       - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
       - rm -rf ./build && PATH=$PATH:/usr/local/cuda/bin PL_BACKEND="lightning_GPU" python setup.py bdist_wheel

From 2e983a7fa0e31924690813829c78e97cbdda0a77 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 15:15:44 -0400
Subject: [PATCH 18/30] Fix typo in name and paths

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 8fb027ba4b..9c9a5c96df 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -27,5 +27,5 @@ build:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
     post_install:
       - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
-      - rm -rf ./build && PATH=$PATH:/usr/local/cuda/bin PL_BACKEND="lightning_GPU" python setup.py bdist_wheel
+      - rm -rf ./build && PL_BACKEND="lightning_gpu" python setup.py bdist_wheel
       - python -m pip install ./dist/*.whl

From 1914de3a8c348416224a6872222020bb0aea4742 Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 15:41:34 -0400
Subject: [PATCH 19/30] Disable CUDA checks for RTD

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 9c9a5c96df..89e22be5f4 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -27,5 +27,5 @@ build:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
     post_install:
       - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
-      - rm -rf ./build && PL_BACKEND="lightning_gpu" python setup.py bdist_wheel
+      - rm -rf ./build && PL_BACKEND="lightning_gpu" python setup.py bdist_wheel --define="PL_DISABLE_CUDA_SAFETY=1"
       - python -m pip install ./dist/*.whl

From 9f086da8e6f4b67fce223f03b475dae0890acc1b Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 25 Oct 2023 17:41:07 +0000
Subject: [PATCH 20/30] update readme

---
 README.rst | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.rst b/README.rst
index 380d736863..136dced6f3 100644
--- a/README.rst
+++ b/README.rst
@@ -293,6 +293,22 @@ The wheel can then be installed as:
 
     python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
 
+The simplest way to install Lightning-GPU with MPI support through ``pip``. Lightning-Qubit is required to be 
+installed before Lightning-GPU.
+.. code-block:: console
+
+   PL_BACKEND="lightning_qubit" python -m pip install -e .
+
+Then we need to set an environment variable `CUQUANTUM_SDK` with the path to `cuStateVec` as follows:
+
+.. code-block:: console
+
+    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
+
+.. code-block:: console
+
+   CMAKE_ARGS="-DENABLE_MPI=ON" PL_BACKEND="lightning_gpu" python -m pip install -e .
+
 Testing
 =======
 

From c09a745a335001041c874d06dddeceb07d8e90c2 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 25 Oct 2023 19:53:42 +0000
Subject: [PATCH 21/30] update LGPU installation steps

---
 README.rst | 52 ++++++++++++++++------------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/README.rst b/README.rst
index 136dced6f3..ee05b9ab95 100644
--- a/README.rst
+++ b/README.rst
@@ -246,19 +246,27 @@ Please see the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ ins
 Installing from source
 ======================
 
-To build a wheel from the package sources using the direct SDK path:
+To install Lightning-GPU from the package sources using the direct SDK path, Lightning-Qubit should be install before Lightning-GPU:
+
+.. code-block:: console
+
+    git clone https://github.com/PennyLaneAI/pennylane-lightning.git
+    cd pennylane-lightning
+    pip install -r requirements.txt
+    PL_BACKEND="lightning_qubit" pip install -e . -vv
+
+Then install the `cuStateVec <https://docs.nvidia.com/cuda/cuquantum/latest/custatevec/index.html>`_ library and set the ``CUQUANTUM_SDK`` environment variable:
 
 .. code-block:: console
 
     python -m pip install wheel custatevec-cu11
-    python setup.py build_ext --cuquantum=<path to sdk>
-    python setup.py bdist_wheel
+    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
 
-The wheel can then be installed with:
+Lightning-GPU can then be installed with ``pip``:
 
 .. code-block:: console
 
-    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
+    PL_BACKEND="lightning_gpu" python -m pip install -e .
 
 To simplify the build, we recommend using the containerized build process described in section `Docker support <docker-support>`_.
 
@@ -273,41 +281,13 @@ It is recommended to install the ``NVIDIA cuQuantum SDK`` and ``mpi4py`` Python
 Please consult the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ , `mpi4py <https://mpi4py.readthedocs.io/en/stable/install.html>`_,
 `MPICH <https://www.mpich.org/static/downloads/4.1.1/mpich-4.1.1-README.txt>`_, or `OpenMPI <https://www.open-mpi.org/faq/?category=buildcuda>`_ install guide for more information.
 
-To build a wheel with MPI support using the direct SDK path:
-
-.. code-block:: console
-
-    python -m pip install wheel 
-    python setup.py build_ext --define="ENABLE_MPI=ON" --cuquantum=<path to sdk>
-    python setup.py bdist_wheel
-
-If cuStateVec is installed with ``pip``, you may set ``--cuquantum=${CUQUANTUM_SDK}`` where
-
-.. code-block:: console
-
-    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
-
-The wheel can then be installed as:
-
-.. code-block:: console
-
-    python -m pip install ./dist/PennyLane_Lightning_GPU-*.whl
-
-The simplest way to install Lightning-GPU with MPI support through ``pip``. Lightning-Qubit is required to be 
-installed before Lightning-GPU.
-.. code-block:: console
-
-   PL_BACKEND="lightning_qubit" python -m pip install -e .
-
-Then we need to set an environment variable `CUQUANTUM_SDK` with the path to `cuStateVec` as follows:
+Before installing Lightning-GPU with MPI support using the direct SDK path, please ensure Lightning-Qubit, ``CUDA-aware MPI`` and ``custatevec`` are installed and the environment variable ``CUQUANTUM_SDK`` is set properly.
+Lightning-GPU with MPI support can then be installed with ``pip``:
 
 .. code-block:: console
 
-    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
-
-.. code-block:: console
+    CMAKE_ARGS="-DENABLE_MPI=ON"  PL_BACKEND="lightning_gpu" python -m pip install -e .
 
-   CMAKE_ARGS="-DENABLE_MPI=ON" PL_BACKEND="lightning_gpu" python -m pip install -e .
 
 Testing
 =======

From 4cf07743678a6b80eba7426b80a28b7ff8386493 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Wed, 25 Oct 2023 16:04:12 -0400
Subject: [PATCH 22/30] Turn off GPU runners.

---
 .github/workflows/tests_gpu_cu11.yml          | 2 +-
 .github/workflows/tests_gpu_kokkos.yml        | 2 +-
 .github/workflows/tests_linux_x86_mpi_gpu.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests_gpu_cu11.yml b/.github/workflows/tests_gpu_cu11.yml
index 1593c1b15d..3948090b57 100644
--- a/.github/workflows/tests_gpu_cu11.yml
+++ b/.github/workflows/tests_gpu_cu11.yml
@@ -1,6 +1,6 @@
 name: Testing::Linux::x86_64::LGPU
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/tests_gpu_kokkos.yml b/.github/workflows/tests_gpu_kokkos.yml
index e65aa0697d..c6f7820f81 100644
--- a/.github/workflows/tests_gpu_kokkos.yml
+++ b/.github/workflows/tests_gpu_kokkos.yml
@@ -1,6 +1,6 @@
 name: Testing::LKokkos::GPU
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/tests_linux_x86_mpi_gpu.yml b/.github/workflows/tests_linux_x86_mpi_gpu.yml
index e18a8f13e0..9d3edfe913 100644
--- a/.github/workflows/tests_linux_x86_mpi_gpu.yml
+++ b/.github/workflows/tests_linux_x86_mpi_gpu.yml
@@ -14,7 +14,7 @@ on:
   push:
     branches:
       - main
-  pull_request:
+  #pull_request:
 
 env:
   COVERAGE_FLAGS: "--cov=pennylane_lightning --cov-report=term-missing --cov-report=xml:./coverage.xml --no-flaky-report -p no:warnings --tb=native"

From 1faa8110ba4c477139a2402bb3c14dfcf53e66bb Mon Sep 17 00:00:00 2001
From: "Lee J. O'Riordan" <lee@xanadu.ai>
Date: Wed, 25 Oct 2023 16:05:03 -0400
Subject: [PATCH 23/30] Update CUDA wheel builder

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 89e22be5f4..e4d85ee56b 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -27,5 +27,5 @@ build:
       - echo "setuptools~=66.0\npip~=22.0" >> ci_build_requirements.txt
     post_install:
       - rm -rf ./build && PL_BACKEND="lightning_kokkos" python setup.py bdist_wheel
-      - rm -rf ./build && PL_BACKEND="lightning_gpu" python setup.py bdist_wheel --define="PL_DISABLE_CUDA_SAFETY=1"
+      - rm -rf ./build && PL_BACKEND="lightning_gpu" python setup.py build_ext --define="PL_DISABLE_CUDA_SAFETY=1" && PL_BACKEND="lightning_gpu" python setup.py bdist_wheel
       - python -m pip install ./dist/*.whl

From 7ae4cec09d2b4f8f6c4c797bd767cde9ea918981 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Wed, 25 Oct 2023 20:51:42 +0000
Subject: [PATCH 24/30] add docstring in lightning_gpu.py

---
 .../lightning_gpu/lightning_gpu.py            | 93 ++++++++++++++++++-
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 98de0e9512..2512d6341a 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -205,8 +205,16 @@ def _mebibytesToBytes(mebibytes):
 
     class LightningGPU(LightningBase):  # pylint: disable=too-many-instance-attributes
         """PennyLane-Lightning-GPU device.
+
+        A GPU-backed Lightning device using NVIDIA cuQuantum SDK.
+
+        Use of this device requires pre-built binaries or compilation from source. Check out the
+        :doc:`/lightning_gpu/installation` guide for more details.
+
         Args:
             wires (int): the number of wires to initialize the device with
+            mpi (bool): enable MPI support. MPI support will be enabled if ``mpi`` is set to ``True``.
+            mpi_buf_size (int): size of GPU memory (in MiB) reserved for MPI operations. Defaults to 64 MiB.
             sync (bool): immediately sync with host-sv after applying operations
             c_dtype: Datatypes for statevector representation. Must be one of ``np.complex64`` or ``np.complex128``.
             shots (int): How many times the circuit should be evaluated (or sampled) to estimate
@@ -216,7 +224,7 @@ class LightningGPU(LightningBase):  # pylint: disable=too-many-instance-attribut
             batch_obs (Union[bool, int]): determine whether to use multiple GPUs within the same node or not
         """
 
-        name = "PennyLane plugin for GPU-backed Lightning device using NVIDIA cuQuantum SDK"
+        name = "Lightning GPU PennyLane plugin"
         short_name = "lightning.gpu"
 
         operations = allowed_operations
@@ -283,6 +291,7 @@ def __init__(
             self._create_basis_state(0)
 
         def _mpi_init_helper(self, num_wires):
+            """Set up MPI checks."""
             if not MPI_SUPPORT:
                 raise ImportError("MPI related APIs are not found.")
             # initialize MPIManager and config check in the MPIManager ctor
@@ -545,6 +554,7 @@ def apply_lightning(self, operations):
 
         # pylint: disable=unused-argument
         def apply(self, operations, rotations=None, **kwargs):
+            """Applies a list of operations to the state tensor."""
             # State preparation is currently done in Python
             if operations:  # make sure operations[0] exists
                 if isinstance(operations[0], StatePrep):
@@ -635,6 +645,12 @@ def _init_process_jacobian_tape(self, tape, starting_state, use_device_state):
             return self._gpu_state
 
         def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
+            """Implements the adjoint method outlined in
+            `Jones and Gacon <https://arxiv.org/abs/2009.02823>`__ to differentiate an input tape.
+
+            After a forward pass, the circuit is reversed by iteratively applying adjoint
+            gates to scan backwards through the circuit.
+            """
             if self.shots is not None:
                 warn(
                     "Requested adjoint differentiation to be computed with finite shots."
@@ -697,7 +713,42 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
 
         # pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring
         def vjp(self, measurements, grad_vec, starting_state=None, use_device_state=False):
-            """Generate the processing function required to compute the vector-Jacobian products of a tape."""
+            """Generate the processing function required to compute the vector-Jacobian products
+            of a tape.
+
+            This function can be used with multiple expectation values or a quantum state.
+            When a quantum state is given,
+
+            .. code-block:: python
+
+                vjp_f = dev.vjp([qml.state()], grad_vec)
+                vjp = vjp_f(tape)
+
+            computes :math:`w = (w_1,\\cdots,w_m)` where
+
+            .. math::
+
+                w_k = \\langle v| \\frac{\\partial}{\\partial \\theta_k} | \\psi_{\\pmb{\\theta}} \\rangle.
+
+            Here, :math:`m` is the total number of trainable parameters,
+            :math:`\\pmb{\\theta}` is the vector of trainable parameters and
+            :math:`\\psi_{\\pmb{\\theta}}` is the output quantum state.
+
+            Args:
+                measurements (list): List of measurement processes for vector-Jacobian product.
+                    Currently, these must be expectation values or a quantum state.
+                grad_vec (tensor_like): Gradient-output vector. Must have shape matching the output
+                    shape of the corresponding tape, i.e. number of measurements if the return
+                    type is expectation or :math:`2^N` if the return type is statevector
+                starting_state (tensor_like): post-forward pass state to start execution with.
+                    It should be complex-valued. Takes precedence over ``use_device_state``.
+                use_device_state (bool): use current device state to initialize.
+                    A forward pass of the same circuit should be the last thing the device
+                    has executed. If a ``starting_state`` is provided, that takes precedence.
+
+            Returns:
+                The processing function required to compute the vector-Jacobian products of a tape.
+            """
             if self.shots is not None:
                 warn(
                     "Requested adjoint differentiation to be computed with finite shots."
@@ -742,6 +793,7 @@ def processing_fn(tape):
 
         # pylint: disable=attribute-defined-outside-init
         def sample(self, observable, shot_range=None, bin_size=None, counts=False):
+            """Return samples of an observable."""
             if observable.name != "PauliZ":
                 self.apply_lightning(observable.diagonalizing_gates())
                 self._samples = self.generate_samples()
@@ -763,6 +815,19 @@ def generate_samples(self):
 
         # pylint: disable=protected-access, missing-function-docstring
         def expval(self, observable, shot_range=None, bin_size=None):
+            """Expectation value of the supplied observable.
+
+            Args:
+                observable: A PennyLane observable.
+                shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
+                    to use. If not specified, all samples are used.
+                bin_size (int): Divides the shot range into bins of size ``bin_size``, and
+                    returns the measurement statistic separately over each bin. If not
+                    provided, the entire shot range is treated as a single bin.
+
+            Returns:
+                Expectation value of the observable
+            """
             if self.shots is not None:
                 # estimate the expectation value
                 samples = self.sample(observable, shot_range=shot_range, bin_size=bin_size)
@@ -814,6 +879,15 @@ def expval(self, observable, shot_range=None, bin_size=None):
             return self.measurements.expval(observable.name, observable_wires)
 
         def probability_lightning(self, wires=None):
+            """Return the probability of each computational basis state.
+
+            Args:
+                wires (Iterable[Number, str], Number, str, Wires): wires to return
+                    marginal probabilities for. Wires not provided are traced out of the system.
+
+            Returns:
+                array[float]: list of the probabilities
+            """
             # translate to wire labels used by device
             observable_wires = self.map_wires(wires)
             # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
@@ -825,6 +899,19 @@ def probability_lightning(self, wires=None):
 
         # pylint: disable=missing-function-docstring
         def var(self, observable, shot_range=None, bin_size=None):
+            """Variance of the supplied observable.
+
+            Args:
+                observable: A PennyLane observable.
+                shot_range (tuple[int]): 2-tuple of integers specifying the range of samples
+                    to use. If not specified, all samples are used.
+                bin_size (int): Divides the shot range into bins of size ``bin_size``, and
+                    returns the measurement statistic separately over each bin. If not
+                    provided, the entire shot range is treated as a single bin.
+
+            Returns:
+                Variance of the observable
+            """
             if self.shots is not None:
                 # estimate the var
                 # Lightning doesn't support sampling yet
@@ -858,7 +945,7 @@ def var(self, observable, shot_range=None, bin_size=None):
 
     class LightningGPU(LightningBaseFallBack):  # pragma: no cover
         # pylint: disable=missing-class-docstring, too-few-public-methods
-        name = "PennyLane plugin for GPU-backed Lightning device using NVIDIA cuQuantum SDK: [No binaries found - Fallback: default.qubit]"
+        name = "Lightning GPU PennyLane plugin: [No binaries found - Fallback: default.qubit]"
         short_name = "lightning.gpu"
 
         def __init__(self, wires, *, c_dtype=np.complex128, **kwargs):

From ef353b61d23df1b1d71fcdb8c8cb863108415f16 Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Thu, 26 Oct 2023 12:10:57 -0400
Subject: [PATCH 25/30] Change kokkos gpu order. [skip ci]

---
 doc/code/__init__.rst | 11 +++++------
 doc/installation.rst  |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/doc/code/__init__.rst b/doc/code/__init__.rst
index 61d4a44caf..b77d3c91a3 100644
--- a/doc/code/__init__.rst
+++ b/doc/code/__init__.rst
@@ -18,17 +18,16 @@ This section contains the API documentation for the Lightning packages.
    :description: API documentation for the lightning_qubit package
    :link: ../lightning_qubit/package.html
 
+.. title-card::
+   :name: lightning_gpu
+   :description: API documentation for the lightning_gpu package
+   :link: ../lightning_gpu/package.html
 
 .. title-card::
    :name: lightning_kokkos
    :description: API documentation for the lightning_kokkos package
    :link: ../lightning_kokkos/package.html
 
-.. title-card::
-   :name: lightning_gpu
-   :description: API documentation for the lightning_gpu package
-   :link: ../lightning_gpu/package.html
-
 .. raw:: html
 
         <div style='clear:both'></div>
@@ -38,5 +37,5 @@ This section contains the API documentation for the Lightning packages.
    :hidden:
 
    ../lightning_qubit/package
-   ../lightning_kokkos/package
    ../lightning_gpu/package
+   ../lightning_kokkos/package
diff --git a/doc/installation.rst b/doc/installation.rst
index e31963aff9..c0b056f5c0 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -27,5 +27,5 @@ Each device in the Lightning ecosystem is a separate Python package. Select the
    :hidden:
 
    lightning_qubit/installation
-   lightning_kokkos/installation
    lightning_gpu/installation
+   lightning_kokkos/installation

From 762ed7e02b5a5adeb4c0e4f474150c1799b68e9f Mon Sep 17 00:00:00 2001
From: Vincent Michaud-Rioux <vincent.michaud-rioux@xanadu.ai>
Date: Thu, 26 Oct 2023 13:54:43 -0400
Subject: [PATCH 26/30] Fix some headings and toctrees [skip ci].

---
 README.rst                                    | 64 +++++++------------
 doc/code/__init__.rst                         |  4 +-
 doc/index.rst                                 |  2 +-
 doc/lightning_gpu/device.rst                  |  2 +-
 doc/lightning_qubit/development/index.rst     |  2 +-
 .../lightning_gpu/lightning_gpu.py            |  2 +-
 6 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/README.rst b/README.rst
index ee05b9ab95..627fe32383 100644
--- a/README.rst
+++ b/README.rst
@@ -41,8 +41,6 @@ The Lightning plugin ecosystem provides fast state-vector simulators written in
 learning, automatic differentiation, and optimization of hybrid quantum-classical computations.
 PennyLane supports Python 3.9 and above.
 
-
-
 Features
 ********
 
@@ -75,15 +73,11 @@ The following table summarizes the supported platforms and the primary installat
 
 .. installation_LQubit-start-inclusion-marker-do-not-remove
 
+Lightning-Qubit installation
+****************************
 
-Installation
-************
-
-Pip installation
-================
-
-PyPI wheels
------------
+PyPI wheels (pip)
+=================
 
 Lightning plugins can be installed using ``pip`` as follows
 
@@ -100,8 +94,8 @@ In order to install the Lightning-GPU and Lightning-Kokkos (OpenMP) backends, yo
     $ pip install pennylane-lightning[kokkos]
 
 
-Installing from source
-----------------------
+Install from source
+===================
 
 To build Lightning plugins from source you can run
 
@@ -131,7 +125,7 @@ The Lightning-GPU backend has several dependencies (e.g. ``CUDA``, ``custatevec-
 Similarly, for Lightning-Kokkos it is recommended to configure and install Kokkos independently as prescribed in the `Lightning-Kokkos <lightning-kokkos>`_ section below.
 
 Development installation
-------------------------
+========================
 
 For development and testing, you can install by cloning the repository:
 
@@ -160,8 +154,8 @@ or with ``build_ext`` and the ``--define`` flag as follows:
 
 where ``-D`` must not be included before ``;``-separated options.
 
-Compile on Windows with MSVC
-----------------------------
+Compile MSVC (Windows)
+======================
 
 Lightning-Qubit can be compiled on Windows using the
 `Microsoft Visual C++ <https://visualstudio.microsoft.com/vs/features/cplusplus/>`_ compiler.
@@ -204,7 +198,7 @@ Supported options are
 - ``-DENABLE_CLANG_TIDY:BOOL=ON``
 
 Testing
-*******
+=======
 
 To test that a plugin is working correctly, test the Python code with:
 
@@ -220,18 +214,14 @@ The C++ code can be tested with
 
     $ PL_BACKEND=${PL_BACKEND} make test-cpp
 
-
 .. installation_LQubit-end-inclusion-marker-do-not-remove
 
 .. installation_LGPU-start-inclusion-marker-do-not-remove
 
 .. _lightning-gpu:
 
-Lightning-GPU
-*************
-
-PyPI wheels
-===========
+Lightning-GPU installation
+**************************
 
 Lightning-GPU can be installed using ``pip``:
 
@@ -243,8 +233,8 @@ Lightning-GPU requires the `cuQuantum SDK <https://developer.nvidia.com/cuquantu
 The SDK may be installed within the Python environment ``site-packages`` directory using ``pip`` or ``conda`` or the SDK library path appended to the ``LD_LIBRARY_PATH`` environment variable.
 Please see the `cuQuantum SDK <https://developer.nvidia.com/cuquantum-sdk>`_ install guide for more information.
 
-Installing from source
-======================
+Install L-GPU from source
+=========================
 
 To install Lightning-GPU from the package sources using the direct SDK path, Lightning-Qubit should be install before Lightning-GPU:
 
@@ -270,8 +260,8 @@ The Lightning-GPU can then be installed with ``pip``:
 
 To simplify the build, we recommend using the containerized build process described in section `Docker support <docker-support>`_.
 
-Build Lightning-GPU with MPI
-----------------------------
+Install L-GPU with MPI
+======================
 
 Building Lightning-GPU with MPI also requires the ``NVIDIA cuQuantum SDK`` (currently supported version: `custatevec-cu11 <https://pypi.org/project/cuquantum-cu11/>`_), ``mpi4py`` and ``CUDA-aware MPI`` (Message Passing Interface).
 ``CUDA-aware MPI`` allows data exchange between GPU memory spaces of different nodes without the need for CPU-mediated transfers.
@@ -289,11 +279,8 @@ Then Lightning-GPU with MPI support can then be installed with ``pip``:
     CMAKE_ARGS="-DENABLE_MPI=ON"  PL_BACKEND="lightning_gpu" python -m pip install -e .
 
 
-Testing
-=======
-
-Test Lightning-GPU with MPI support
------------------------------------
+Test L-GPU with MPI
+===================
 
 You may test the Python layer of the MPI enabled plugin as follows:
 
@@ -317,25 +304,22 @@ The C++ code is tested with
 
 .. _lightning-kokkos:
 
-Lightning-Kokkos
-****************
-
-PyPI wheels
-===========
+Lightning-Kokkos installation
+*****************************
 
-On linux systems, `lightning.kokkos` and be installed with the OpenMP backend by providing the optional ``[kokkos]`` tag:
+On Linux systems, `lightning.kokkos` with the OpenMP backend can be installed by providing the optional ``[kokkos]`` tag:
 
 .. code-block:: console
 
     $ pip install pennylane-lightning[kokkos]
 
-Installing from source
-======================
+Install L-Kokkos from source
+============================
 
 As Kokkos enables support for many different HPC-targeted hardware platforms, `lightning.kokkos` can be built to support any of these platforms when building from source.
 
 We suggest first installing Kokkos with the wanted configuration following the instructions found in the `Kokkos documentation <https://kokkos.github.io/kokkos-core-wiki/building.html>`_.
-For example,
+For example, the following will build Kokkos for NVIDIA A100 cards:
 
 .. code-block:: console
 
diff --git a/doc/code/__init__.rst b/doc/code/__init__.rst
index b77d3c91a3..1e4eb7d3c8 100644
--- a/doc/code/__init__.rst
+++ b/doc/code/__init__.rst
@@ -1,5 +1,5 @@
-pennylane_lightning
-===================
+Python API
+==========
 
 This section contains the API documentation for the Lightning packages.
 
diff --git a/doc/index.rst b/doc/index.rst
index 695c3bb9d0..f48d86c567 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -14,7 +14,7 @@ Lightning plugins
 
 
 Devices
--------
+*******
 
 The Lightning ecosystem provides the following devices:
 
diff --git a/doc/lightning_gpu/device.rst b/doc/lightning_gpu/device.rst
index b280b6a50d..49ad3acf37 100644
--- a/doc/lightning_gpu/device.rst
+++ b/doc/lightning_gpu/device.rst
@@ -1,4 +1,4 @@
-Lightning-GPU device
+Lightning GPU device
 ======================
 
 The ``lightning.gpu`` device is an extension of PennyLane's built-in ``lightning.qubit`` device.
diff --git a/doc/lightning_qubit/development/index.rst b/doc/lightning_qubit/development/index.rst
index 7eb4a918e4..90489e166f 100644
--- a/doc/lightning_qubit/development/index.rst
+++ b/doc/lightning_qubit/development/index.rst
@@ -20,5 +20,5 @@ Lightning Qubit
 .. toctree::
    :hidden:
 
-   avx_kernels/index
    add_gate_kernel
+   avx_kernels/index
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 2512d6341a..177275ec13 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -204,7 +204,7 @@ def _mebibytesToBytes(mebibytes):
     }
 
     class LightningGPU(LightningBase):  # pylint: disable=too-many-instance-attributes
-        """PennyLane-Lightning-GPU device.
+        """PennyLane Lightning GPU device.
 
         A GPU-backed Lightning device using NVIDIA cuQuantum SDK.
 

From 3d2dde4265eb7f9419e067b34de4dee7e80e290b Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 26 Oct 2023 19:44:07 +0000
Subject: [PATCH 27/30] skip some ci checks

---
 .github/workflows/tests_gpu_kokkos.yml        | 2 +-
 .github/workflows/tests_linux.yml             | 2 +-
 .github/workflows/tests_windows.yml           | 2 +-
 .github/workflows/tests_without_binary.yml    | 2 +-
 .github/workflows/wheel_linux_x86_64.yml      | 2 +-
 .github/workflows/wheel_linux_x86_64_cu11.yml | 2 +-
 .github/workflows/wheel_macos_arm64.yml       | 2 +-
 .github/workflows/wheel_macos_x86_64.yml      | 2 +-
 .github/workflows/wheel_noarch.yml            | 2 +-
 .github/workflows/wheel_win_x86_64.yml        | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/tests_gpu_kokkos.yml b/.github/workflows/tests_gpu_kokkos.yml
index 2879be1c8b..b090031111 100644
--- a/.github/workflows/tests_gpu_kokkos.yml
+++ b/.github/workflows/tests_gpu_kokkos.yml
@@ -10,7 +10,7 @@ on:
         type: string
         required: true
         description: The version of PennyLane to use. Valid values are either 'stable' (most recent git-tag) or 'latest' (most recent commit from master)
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml
index b5df17d2ea..2ea429fe66 100644
--- a/.github/workflows/tests_linux.yml
+++ b/.github/workflows/tests_linux.yml
@@ -10,7 +10,7 @@ on:
         type: string
         required: true
         description: The version of PennyLane to use. Valid values are either 'stable' (most recent git-tag) or 'latest' (most recent commit from master)
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/tests_windows.yml b/.github/workflows/tests_windows.yml
index 36f34bf23b..98cd80b5ee 100644
--- a/.github/workflows/tests_windows.yml
+++ b/.github/workflows/tests_windows.yml
@@ -3,7 +3,7 @@ on:
   push:
     branches:
       - master
-  pull_request:
+  #pull_request:
 
 concurrency:
   group: tests_windows-${{ github.ref }}
diff --git a/.github/workflows/tests_without_binary.yml b/.github/workflows/tests_without_binary.yml
index bb21da4cfb..845b215e96 100644
--- a/.github/workflows/tests_without_binary.yml
+++ b/.github/workflows/tests_without_binary.yml
@@ -10,7 +10,7 @@ on:
         type: string
         required: true
         description: The version of PennyLane to use. Valid values are either 'stable' (most recent git-tag) or 'latest' (most recent commit from master)
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_linux_x86_64.yml b/.github/workflows/wheel_linux_x86_64.yml
index 45428e71f1..5d14a2670b 100644
--- a/.github/workflows/wheel_linux_x86_64.yml
+++ b/.github/workflows/wheel_linux_x86_64.yml
@@ -9,7 +9,7 @@ env:
   GCC_VERSION: 11
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_linux_x86_64_cu11.yml b/.github/workflows/wheel_linux_x86_64_cu11.yml
index 2b270c4b0e..c4dd47bc68 100644
--- a/.github/workflows/wheel_linux_x86_64_cu11.yml
+++ b/.github/workflows/wheel_linux_x86_64_cu11.yml
@@ -11,7 +11,7 @@ env:
   CUDA_VERSION_MINOR: 5
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_macos_arm64.yml b/.github/workflows/wheel_macos_arm64.yml
index 409eab594d..6cd9e1c701 100644
--- a/.github/workflows/wheel_macos_arm64.yml
+++ b/.github/workflows/wheel_macos_arm64.yml
@@ -6,7 +6,7 @@ name: Wheel::MacOS::ARM
 # **Who does it impact**: Wheels to be uploaded to PyPI.
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_macos_x86_64.yml b/.github/workflows/wheel_macos_x86_64.yml
index f5d779f07a..6c0b74ba9a 100644
--- a/.github/workflows/wheel_macos_x86_64.yml
+++ b/.github/workflows/wheel_macos_x86_64.yml
@@ -6,7 +6,7 @@ name: Wheel::MacOS::Intel
 # **Who does it impact**: Wheels to be uploaded to PyPI.
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_noarch.yml b/.github/workflows/wheel_noarch.yml
index d3e6622730..000a800cf2 100644
--- a/.github/workflows/wheel_noarch.yml
+++ b/.github/workflows/wheel_noarch.yml
@@ -6,7 +6,7 @@ name: Wheel::Any::None
 # **Who does it impact**: Wheels to be uploaded to PyPI.
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
diff --git a/.github/workflows/wheel_win_x86_64.yml b/.github/workflows/wheel_win_x86_64.yml
index 19de8e3275..536fb6dc18 100644
--- a/.github/workflows/wheel_win_x86_64.yml
+++ b/.github/workflows/wheel_win_x86_64.yml
@@ -6,7 +6,7 @@ name: Wheel::Windows::x86_64
 # **Who does it impact**: Wheels to be uploaded to PyPI.
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master

From 5a1119aadd0a006a9d6aeaa37b9ad28d78effa93 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 26 Oct 2023 20:15:22 +0000
Subject: [PATCH 28/30] only allow rank0 read from io

---
 mpitests/test_adjoint_jacobian.py | 42 +++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index 3657c336f8..769f834968 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -1163,22 +1163,37 @@ def test_integration_H2_Hamiltonian(
     """Tests getting the total energy and its derivatives for an H2 Hamiltonian."""
     _ = pytest.importorskip("openfermionpyscf")
 
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+
     n_electrons = 2
     np.random.seed(1337)
 
-    str_path = create_xyz_file
-    symbols, coordinates = qml.qchem.read_structure(str(str_path), outpath=str(str_path.parent))
-
-    H, qubits = qml.qchem.molecular_hamiltonian(
-        symbols,
-        coordinates,
-        method="pyscf",
-        basis="6-31G",
-        active_electrons=n_electrons,
-        name="h2",
-        outpath=str(str_path.parent),
-        load_data=True,
-    )
+    if rank == 0:
+        str_path = create_xyz_file
+        symbols, coordinates = qml.qchem.read_structure(str(str_path), outpath=str(str_path.parent))
+
+        H, qubits = qml.qchem.molecular_hamiltonian(
+            symbols,
+            coordinates,
+            method="pyscf",
+            basis="6-31G",
+            active_electrons=n_electrons,
+            name="h2",
+            outpath=str(str_path.parent),
+            load_data=True,
+        )
+    else:
+        symbols = None
+        coordinates = None
+        H = None
+        qubits = None
+
+    symbols = comm.bcast(symbols, root=0)
+    coordinates = comm.bcast(coordinates, root=0)
+    H = comm.bcast(H, root=0)
+    qubits = comm.bcast(qubits, root=0)
+
     hf_state = qml.qchem.hf_state(n_electrons, qubits)
     _, doubles = qml.qchem.excitations(n_electrons, qubits)
 
@@ -1211,6 +1226,7 @@ def circuit_compare(params, excitations):
     jac_func_comp = qml.jacobian(circuit_compare)
 
     params = qml.numpy.array([0.0] * len(doubles), requires_grad=True)
+
     jacs = jac_func(params, excitations=doubles)
     jacs_comp = jac_func_comp(params, excitations=doubles)
 

From 384343c955786ffc43db43e66615b49db9411ed5 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 26 Oct 2023 20:18:42 +0000
Subject: [PATCH 29/30] quick fix

---
 mpitests/test_adjoint_jacobian.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index dfb5da2f6f..b941d11f6b 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -1223,6 +1223,9 @@ def circuit_compare(params, excitations):
                 qml.SingleExcitation(params[i], wires=excitation)
         return qml.expval(H)
 
+    jac_func = qml.jacobian(circuit)
+    jac_func_comp = qml.jacobian(circuit_compare)
+
     params = qml.numpy.array([0.0] * len(doubles), requires_grad=True)
 
     jacs = jac_func(params, excitations=doubles)

From 7b2e8d3f70d3b610bbf6d881bdf0d6feaf6fbbb6 Mon Sep 17 00:00:00 2001
From: Shuli Shu <08cnbj@gmail.com>
Date: Thu, 26 Oct 2023 20:24:10 +0000
Subject: [PATCH 30/30] skip cpp mpi tests

---
 .github/workflows/tests_linux_x86_mpi_gpu.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests_linux_x86_mpi_gpu.yml b/.github/workflows/tests_linux_x86_mpi_gpu.yml
index 9d3edfe913..82f3be7b0e 100644
--- a/.github/workflows/tests_linux_x86_mpi_gpu.yml
+++ b/.github/workflows/tests_linux_x86_mpi_gpu.yml
@@ -14,7 +14,7 @@ on:
   push:
     branches:
       - main
-  #pull_request:
+  pull_request:
 
 env:
   COVERAGE_FLAGS: "--cov=pennylane_lightning --cov-report=term-missing --cov-report=xml:./coverage.xml --no-flaky-report -p no:warnings --tb=native"
@@ -28,7 +28,8 @@ concurrency:
 
 jobs:
   cpp_tests:
-    if: contains(github.event.pull_request.labels.*.name, 'ci:use-multi-gpu-runner') || (inputs.lightning-version != '' && inputs.pennylane-version != '')
+    #if: contains(github.event.pull_request.labels.*.name, 'ci:use-multi-gpu-runner') || (inputs.lightning-version != '' && inputs.pennylane-version != '')
+    if: ${{ github.sha == 'xxxxxxxx' }}
     runs-on:
       - self-hosted
       - linux