From 1f973d0585de4492a7c44853b68afdb7bbb1a6ce Mon Sep 17 00:00:00 2001 From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com> Date: Wed, 25 Oct 2023 14:28:56 -0400 Subject: [PATCH] Add python layer for LGPU_MPI (#518) * add LGPU cpp layer * update measurement * add openmp to adjgpu * Auto update version * Add support for building multiple backend simulators (#497) * Add PL_BACKEND_LIST * Update the support * Exclude Python bindings * Update HermitianObs name scope conflicts * Auto update version * Cleanup * Update CI to build and check C++ tests of multiple backends (Linux) * Update changelog * Update .github/workflows/tests_linux.yml Co-authored-by: Vincent Michaud-Rioux * Apply code review suggestions * Update .github/workflows/tests_linux.yml Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> --------- Co-authored-by: Dev version update bot Co-authored-by: Vincent Michaud-Rioux Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> * add python layer & isingxy gate in the cpp layer * add batched adjoint method * Update DefaultQubit to DefaultQubitLegacy (#500) * Update DefaultQubit to DefaultQubitLegacy * Update changelog * update pylint disable on fallback * Auto update version * add batch support for adjoint method * add gitignore * tidy up code * Auto update version * make format * revert complexT delete in LKokkosBingds * make format * update based on tidy * fix tidy format * add_gpu_runners_tests * add cuquantum_sdk path to ci workflow * debug * add path to cuquantum sdk * add python layer tests in ci workflow * ci tests * quick fix * skip pr ci for some workflows * quick fix * quick fix * update python ci tests * remove dependency on lightning_qubit in ci * fix directory * fix directory * quick fix * quick fix * test for cuda-12 * update measurement * updata cu12 workflows * add getDataVector support to LQubitRaw * install lightning.qubit before lightning.gpu in ci * update test_obs * activate 
all CI checks * quick fix * tidy up code * tidy up code * make format * update ci for more tests * tidy up code * tidy up code * tidy up code * make format * fix for codecov * codecov fix * quick fix * quick fix * quick fix * quick test * fix test * fix tests * another quick fix * coverage fix * update ci tests * update ci for no binary * codecov fix * update adj tests for no binary case * update python layer tests * fix codecov * make format * initial commit for MPI * revert to cu11 * enable more py tests * update CI * upload codecov ci * add more tests for statevectorcudamanaged * add more unit tests * add more tests * make format * add more cpp tests * skip cpp tests pauli param gates * make format * add more files to gitignore * Auto update version * init commit * Trigger CI * update gpu runner * quick fix * update fix * add cpp layer for LGPU-MPI backend * add py layer * quick fix * make format * fix for fp32 support in expval calculation * quick fix * fix for cray_mpich_serialize_py * copy to move for hamiltonian operation * add unit tests for adjoint method * add more tests * resolve comments py layer * remove omp support in LGPU * update version * Auto update version * fix based on comments * Add L-GPU and L-Kokkos as package extras (#515) * Add L-GPU and L-Kokkos as package extras * Auto update version * Update changelog * Temp enable the x86 wheel cache * Return wheel storage functionality to normal * Update readme * Auto update version * Trigger CI * Update README.rst Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> --------- Co-authored-by: Dev version update bot Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> * Auto update version * make format * remove sparseH * remove applyHostMatrixGate * Add wheel_linux_x86_64_cu11.yml (#517) * Add wheel_linux_x86_64_cu11.yml * echo COMPILER=g++ >> * python3.9 => python * reinstall g++11 * Try that * Use env vars for versions. * Fix var syntax. 
* Hardcode versions * Fix custatevec-cu11 * Revert triggers. * Update changelog [skip ci] * resolve more comments * add more tests to non_param gates * resolve cpp tests comments * remove unused methods in measurement class * remove unused methods * resolve more comments * add changelog and matrixhasher * quick update * add more tests and merge base branch * add mpi unit tests for algorithm base class * add more unit tests for utils * ctor test for MPIManager * Add mpi tests to LGPU (#519) * Initial commit mpi tests * Remove label guards * Fix PL_DEVICE * Install lightning_qubit. * Fix ENABLE_MPI * print cuquantum * export cu_sdk * revert define * Debug cpp tests. * Debug cpp tests. * Fix cmake options. * Compile with mpicxx * Specify backend. * Specify backend. * Remove obsolete line. * Specify cov backend * Merge test/cov & try simplifying python * if-no-files-found: error and fix python tests. * Fix mpi find * Install real lightning. * Revert python tests. * Hardcode backend values in python tests * Install lightning_qubit with gpu in python tests * Remove explicit mpich mentions. * Parametrize mpilib name. * Add openmpi tests. * Build only openmpi python tests. * Add timeouts * test/test_apply.py * Revert pull triggers. * Clean gpu-mpi test workflows. * Revert to 804ed24. * Revert back. * Update tests_linux_x86_mpi.yml [ci skip] * Add jobs dep. * Remove module unload * Simplify mpi-gpu tests. * trigger CI * unset CFLAGS. * set CFLAGS * Revert triggers. 
* Fix pull_request: [skip ci] * trigger CI * Rename test_gpu_cu11.yml -> tests_gpu_cu11.yml [skip ci] * add CI checks for cpp unit tests * add cpp layer ci check for mpi backend * Auto update version * remove redundant blank lines * tidy up code * Trigger CI * remove single GPU backend tests in mpi ci * upload codecov results * add more unit tests * add tests for pauli word based expval * add more docs * add more tests * skip lcov for native gates * add mpi_helpers * add more docstrings * add change log * Auto update version * Auto update version * fix failures caused by merging * add changelog * Trigger multi-GPU runner * add more fp32 tests to the measurement class * add number of devices and mpi procs check * Add coverage for py-mpitests. (#522) * Initial commit mpi tests * Remove label guards * Fix PL_DEVICE * Install lightning_qubit. * Fix ENABLE_MPI * print cuquantum * export cu_sdk * revert define * Debug cpp tests. * Debug cpp tests. * Fix cmake options. * Compile with mpicxx * Specify backend. * Specify backend. * Remove obsolete line. * Specify cov backend * Merge test/cov & try simplifying python * if-no-files-found: error and fix python tests. * Fix mpi find * Install real lightning. * Revert python tests. * Hardcode backend values in python tests * Install lightning_qubit with gpu in python tests * Remove explicit mpich mentions. * Parametrize mpilib name. * Add openmpi tests. * Build only openmpi python tests. * Add timeouts * test/test_apply.py * Revert pull triggers. * Clean gpu-mpi test workflows. * Revert to 804ed24. * Revert back. * Update tests_linux_x86_mpi.yml [ci skip] * Add jobs dep. * Remove module unload * Simplify mpi-gpu tests. * trigger CI * unset CFLAGS. * set CFLAGS * Revert triggers. * Fix pull_request: [skip ci] * trigger CI * Rename test_gpu_cu11.yml -> tests_gpu_cu11.yml [skip ci] * Add coverage for py-mpitests. * Upload mpi-gpu test coverage. * Try other paths. * trigger CI * Add mpi tests. * Fix couple tests. 
* Fixx test_apply tests? * Add MPI sparse measurements. * Fix format. * Add MPI_Init checks in MPIManager constructors. * Reformat mpitests and add cov for proc > dev error. * Refactor makefile. * Revert to full mpirun path. * Fix couple tests. * Name coverage after matrix.mpilib. * Remove oversubscribe MPI test. * Update changelog [skip ci]. --------- Co-authored-by: Shuli <08cnbj@gmail.com> * add more tests in obs base class * Revert "Merge branch 'add_LGPUMPI' into add_py_LGPUMPI" This reverts commit d3af81987fa6553d1975abf9b5aa9c17bd0edf63, reversing changes made to 6ad1c7c8fd4cee21d7ca3b91aa349e7d1dd2e8ed. * Fix pylint [skip ci] * resolve comments on source codes and tidy up code * Use CRTP to define initSV and remove initSV_MPI * resolve more typos * resolve more typoes * resolve adjoint class * remove py&pybind layer * resolve more comments * Remove redundant blank line * add num mpi & ngpudevice proc check * fix typo * remove unused lines * add more tests * remove initsv_mpi * add reset * make format * use_mpi as _use_mpi in QuantumScriptSerializer * resolve more comments * check->require * make format * rename mpi workflow * Add sparseH for LGPU (#526) * Init commit * Fix std::endl; * Use more generic indices in base std::size_t. * add pybind layer * add python layer * Quick and dirty spham bindings. * Add sparse_ham serialization. * Add sparse_ham tests in tests/test_adjoint_jacobian.py' * Bug fix sparse product. * add sparseH * Trigger CI * Fix python bindings LGPU idxT * Fix serial tests and update changelog. * add more unit tests for sparseH base class * Fix tidy & sparse adjoint test device name. * Fix tidy warning for sparse_ham. * Send backend-specific ops in respective modules. * Fix sparse_hamiltonianmpi_c and add getWires test. * Add sparseH diff capability in LQ. * Add sparse Hamiltonian support for Lightning-Kokkos (#527) * Use more generic indices in base std::size_t. * Quick and dirty spham bindings. * Add sparse_ham serialization. 
* Add sparse_ham tests in tests/test_adjoint_jacobian.py' * Bug fix sparse product. * Fix python bindings LGPU idxT * Fix serial tests and update changelog. * Fix tidy & sparse adjoint test device name. * Fix tidy warning for sparse_ham. * Send backend-specific ops in respective modules. * Fix sparse_hamiltonianmpi_c and add getWires test. * Fix clang tidy * Comment workflows but tidy. * Fix tidy warn * Add override to sp::getWires * Restore triggers * Update tests_linux_x86_mpi.yml * Add constructibility tests. * Move L-Kokkos-CUDA tests to workflow call, called from tests_gpu_cu11.yml. * Remove GPU deadlock. * Bug fix Python MPI. * Upload both outputs. * Update gcc version in format.yml. * Update .github/CHANGELOG.md [skip ci] Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> * Update .github/workflows/tests_gpu_kokkos.yml [skip ci] Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> * rename argn [skip ci] * Remove unused lines [skip ci] * Fix SparseHamiltonianBase::isEqual. [skip ci] * Trigger CI * Auto update version * Trigger CI * resolve comments * rename dev_kokkos to dev * Fix tidy. --------- Co-authored-by: Vincent Michaud-Rioux Co-authored-by: Vincent Michaud-Rioux Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> Co-authored-by: Dev version update bot * update work flow * resolve comments for unit tests * add more unit tests for sparseH * quick fix * add fp32 tests * tidy up code * remove redundant lines * add pylintrc to mpitests * add mpitests dir to commit-config * add mpitests to .coveragerc * add mpitests path to coveragerc * Fix mpitests/test_adjoint_jacobian.py * Fix pylint in mpitests/test_apply [skip ci]. * pylint fix for mpi py_d_e_m_p tets * tidy up cpp code * fix codefactor * revert skipp condition for openfermionpyscf * codefactor fix * add sparseH tests for mpi backend * Install openfermion in CI workflows and fix H2 QChem integration test. 
* update changelog --------- Co-authored-by: Dev version update bot Co-authored-by: Ali Asadi Co-authored-by: Vincent Michaud-Rioux Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> Co-authored-by: Lee James O'Riordan Co-authored-by: Vincent Michaud-Rioux --- .coveragerc | 3 +- .github/CHANGELOG.md | 12 + .github/workflows/format.yml | 5 +- .github/workflows/tests_gpu_cu11.yml | 2 +- .../{tests_gpu.yml => tests_gpu_kokkos.yml} | 3 +- .github/workflows/tests_linux.yml | 3 + ...86_mpi.yml => tests_linux_x86_mpi_gpu.yml} | 135 +- .pre-commit-config.yaml | 2 +- Makefile | 2 +- mpitests/.pylintrc | 52 + mpitests/conftest.py | 120 ++ mpitests/test_adjoint_jacobian.py | 1364 +++++++++++++++++ mpitests/test_apply.py | 1049 +++++++++++++ mpitests/test_device.py | 54 + mpitests/test_expval.py | 332 ++++ mpitests/test_measurements_sparse.py | 168 ++ mpitests/test_probs.py | 312 ++++ pennylane_lightning/core/_serialize.py | 91 +- pennylane_lightning/core/_version.py | 2 +- pennylane_lightning/core/lightning_base.py | 6 +- .../core/src/bindings/Bindings.cpp | 8 + .../core/src/bindings/Bindings.hpp | 6 +- .../core/src/bindings/BindingsMPI.hpp | 483 ++++++ .../core/src/observables/Observables.hpp | 108 ++ .../observables/tests/Test_Observables.cpp | 83 +- .../tests/mpi/Test_ObservablesMPI.cpp | 88 ++ .../simulators/lightning_gpu/CMakeLists.txt | 4 + .../lightning_gpu/bindings/LGPUBindings.hpp | 90 +- .../bindings/LGPUBindingsMPI.hpp | 323 ++++ .../measurements/MeasurementsGPU.hpp | 14 +- .../measurements/MeasurementsGPUMPI.hpp | 15 +- .../Test_StateVectorCudaManaged_Expval.cpp | 34 +- .../mpi/Test_StateVectorCudaMPI_Expval.cpp | 34 +- .../observables/ObservablesGPU.cpp | 3 + .../observables/ObservablesGPU.hpp | 83 + .../observables/ObservablesGPUMPI.cpp | 3 + .../observables/ObservablesGPUMPI.hpp | 94 ++ .../observables/tests/Test_ObservablesGPU.cpp | 42 + .../tests/mpi/Test_ObservablesGPUMPI.cpp | 54 + .../lightning_gpu/utils/LinearAlg.hpp | 48 +- 
.../lightning_gpu/utils/MPILinearAlg.hpp | 12 +- .../utils/tests/Test_LinearAlgebra.cpp | 16 +- .../utils/tests/mpi/Test_LinearAlgebraMPI.cpp | 15 +- .../lightning_kokkos/StateVectorKokkos.hpp | 19 +- .../bindings/LKokkosBindings.hpp | 54 + .../observables/ObservablesKokkos.cpp | 3 + .../observables/ObservablesKokkos.hpp | 71 + .../tests/Test_ObservablesKokkos.cpp | 14 + .../bindings/LQubitBindings.hpp | 58 +- .../observables/ObservablesLQubit.cpp | 6 + .../observables/ObservablesLQubit.hpp | 71 + .../tests/Test_ObservablesLQubit.cpp | 15 + .../core/src/utils/TestHelpers.hpp | 28 +- .../lightning_gpu/lightning_gpu.py | 162 +- requirements-dev.txt | 4 +- tests/test_adjoint_jacobian.py | 68 +- tests/test_device.py | 15 + tests/test_serialize.py | 10 + 58 files changed, 5817 insertions(+), 158 deletions(-) rename .github/workflows/{tests_gpu.yml => tests_gpu_kokkos.yml} (99%) rename .github/workflows/{tests_linux_x86_mpi.yml => tests_linux_x86_mpi_gpu.yml} (55%) create mode 100644 mpitests/.pylintrc create mode 100644 mpitests/conftest.py create mode 100644 mpitests/test_adjoint_jacobian.py create mode 100644 mpitests/test_apply.py create mode 100644 mpitests/test_device.py create mode 100644 mpitests/test_expval.py create mode 100644 mpitests/test_measurements_sparse.py create mode 100644 mpitests/test_probs.py create mode 100644 pennylane_lightning/core/src/bindings/BindingsMPI.hpp create mode 100644 pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp diff --git a/.coveragerc b/.coveragerc index 5c4cfff8a2..e9d7866fff 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,7 +2,8 @@ [run] source = pennylane_lightning omit = - tests* + tests/* + mpitests/* [report] # Regexes for lines to exclude from consideration diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 3ee6676579..8cb50c0517 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -2,6 +2,15 @@ ### New features since last release +* Add `SparseHamiltonian` 
support for Lightning-Qubit and Lightning-GPU. + [(#526)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/526) + +* Add `SparseHamiltonian` support for Lightning-Kokkos. + [(#527)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/527) + +* Integrate python/pybind layer of distributed Lightning-GPU into the Lightning monorepo with python unit tests. + [(#518)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/518) + * Integrate the distributed C++ backend of Lightning-GPU into the Lightning monorepo. [(#514)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/514) @@ -46,6 +55,9 @@ ### Improvements +* Improve Python testing for Lightning-GPU (+MPI) by adding jobs in Actions files and adding Python tests to increase code coverage. + [(#522)](https://github.com/PennyLaneAI/pennylane-lightning/pull/522) + * Add support for `pip install pennylane-lightning[kokkos]` for the OpenMP backend. [(#515)](https://github.com/PennyLaneAI/pennylane-lightning/pull/515) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 6cbde05023..350312d253 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -84,7 +84,7 @@ jobs: cp -rf ${{ github.workspace}}/Kokkos_install/${{ matrix.exec_model }}/* Kokkos/ - name: Install dependencies - run: sudo apt update && sudo apt -y install clang-tidy-14 cmake g++-10 ninja-build libomp-14-dev + run: sudo apt update && sudo apt -y install clang-tidy-14 cmake gcc-11 g++-11 ninja-build libomp-14-dev env: DEBIAN_FRONTEND: noninteractive @@ -96,5 +96,6 @@ jobs: -DBUILD_TESTS=ON \ -DENABLE_WARNINGS=ON \ -DPL_BACKEND=${{ matrix.pl_backend }} \ - -DCMAKE_CXX_COMPILER="$(which g++-10)" + -DCMAKE_CXX_COMPILER="$(which g++-11)" \ + -DCMAKE_C_COMPILER="$(which gcc-11)" cmake --build ./Build \ No newline at end of file diff --git a/.github/workflows/tests_gpu_cu11.yml b/.github/workflows/tests_gpu_cu11.yml index cb15aca14e..925491efe6 100644 --- a/.github/workflows/tests_gpu_cu11.yml 
+++ b/.github/workflows/tests_gpu_cu11.yml @@ -212,7 +212,7 @@ jobs: - name: Install required packages run: | python -m pip install pip~=22.0 - python -m pip install ninja cmake custatevec-cu11 pytest pytest-mock flaky pytest-cov + python -m pip install ninja cmake custatevec-cu11 pytest pytest-mock flaky pytest-cov openfermionpyscf - name: Build and install package env: diff --git a/.github/workflows/tests_gpu.yml b/.github/workflows/tests_gpu_kokkos.yml similarity index 99% rename from .github/workflows/tests_gpu.yml rename to .github/workflows/tests_gpu_kokkos.yml index bbf240d91c..e65aa0697d 100644 --- a/.github/workflows/tests_gpu.yml +++ b/.github/workflows/tests_gpu_kokkos.yml @@ -1,4 +1,4 @@ -name: Testing (GPU) +name: Testing::LKokkos::GPU on: pull_request: push: @@ -237,6 +237,7 @@ jobs: run: | cd main python -m pip install -r requirements-dev.txt + python -m pip install openfermionpyscf - name: Install ML libraries for interfaces run: | diff --git a/.github/workflows/tests_linux.yml b/.github/workflows/tests_linux.yml index ac8a05c593..672327341c 100644 --- a/.github/workflows/tests_linux.yml +++ b/.github/workflows/tests_linux.yml @@ -118,6 +118,7 @@ jobs: run: | cd main python -m pip install -r requirements-dev.txt + python -m pip install openfermionpyscf - name: Install Stable PennyLane if: inputs.pennylane-version == 'stable' @@ -244,6 +245,7 @@ jobs: run: | cd main python -m pip install -r requirements-dev.txt + python -m pip install openfermionpyscf - name: Install Stable PennyLane if: inputs.pennylane-version == 'stable' @@ -412,6 +414,7 @@ jobs: run: | cd main python -m pip install -r requirements-dev.txt + python -m pip install openfermionpyscf - name: Install Stable PennyLane if: inputs.pennylane-version == 'stable' diff --git a/.github/workflows/tests_linux_x86_mpi.yml b/.github/workflows/tests_linux_x86_mpi_gpu.yml similarity index 55% rename from .github/workflows/tests_linux_x86_mpi.yml rename to 
.github/workflows/tests_linux_x86_mpi_gpu.yml index 69040df8e3..e879415492 100644 --- a/.github/workflows/tests_linux_x86_mpi.yml +++ b/.github/workflows/tests_linux_x86_mpi_gpu.yml @@ -157,6 +157,112 @@ jobs: rm -rf * .git .gitignore .github pip cache purge + + python_tests: + if: contains(github.event.pull_request.labels.*.name, 'ci:use-multi-gpu-runner') || (inputs.lightning-version != '' && inputs.pennylane-version != '') + runs-on: + - self-hosted + - linux + - x64 + - ubuntu-22.04 + - multi-gpu + strategy: + max-parallel: 1 + matrix: + mpilib: ["mpich", "openmpi"] + timeout-minutes: 30 + + steps: + - name: Checkout pennyLane-lightning + uses: actions/checkout@v3 + with: + fetch-tags: true + + - name: Switch to stable build of Lightning + if: inputs.lightning-version == 'stable' + run: | + git fetch tags --force + git checkout $(git tag | sort -V | tail -1) + + - uses: actions/setup-python@v4 + id: setup_python + name: Install Python + with: + python-version: '3.9' + + # Since the self-hosted runner can be re-used. 
It is best to set up all package + # installations in a virtual environment that gets cleaned at the end of each workflow run + - name: Setup Python virtual environment + id: setup_venv + env: + VENV_NAME: ${{ github.workspace }}/venv_${{ steps.setup_python.outputs.python-version }}_${{ github.sha }} + run: | + # Clear any pre-existing venvs + rm -rf venv_* + + # Create new venv for this workflow_run + python --version + python -m venv ${{ env.VENV_NAME }} + + # Add the venv to PATH for subsequent steps + echo ${{ env.VENV_NAME }}/bin >> $GITHUB_PATH + + # Adding venv name as an output for subsequent steps to reference if needed + source ${{ env.VENV_NAME }}/bin/activate + echo "venv_name=${{ env.VENV_NAME }}" >> $GITHUB_OUTPUT + + - name: Display Python-Path + id: python_path + run: | + py_path=$(which python) + echo "Python Interpreter Path => $py_path" + echo "python=$py_path" >> $GITHUB_OUTPUT + + pip_path=$(which python) + echo "PIP Path => $pip_path" + echo "pip=$pip_path" >> $GITHUB_OUTPUT + + - name: Install Latest PennyLane + # We want to install the latest PL on non workflow_call events + if: inputs.pennylane-version == 'latest' || inputs.pennylane-version == '' + run: python -m pip install git+https://github.com/PennyLaneAI/pennylane.git@master + + - name: Install required packages + run: | + source /etc/profile.d/modules.sh && module use /opt/modules/ && module load ${{ matrix.mpilib }} + python -m pip install pip~=22.0 + python -m pip install ninja cmake custatevec-cu11 pytest pytest-mock flaky pytest-cov mpi4py openfermionpyscf + SKIP_COMPILATION=True PL_BACKEND=lightning_qubit python -m pip install -e . 
-vv + + - name: Build and install package + env: + CUQUANTUM_SDK: $(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')") + run: | + source /etc/profile.d/modules.sh && module use /opt/modules/ && module load ${{ matrix.mpilib }} + CMAKE_ARGS="-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DENABLE_MPI=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=${{ env.CI_CUDA_ARCH }} -DPython_EXECUTABLE=${{ steps.python_path.outputs.python }}" \ + PL_BACKEND=lightning_gpu python -m pip install -e . --verbose + + - name: Run unit tests for MPI-enabled lightning.gpu device + run: | + source /etc/profile.d/modules.sh && module use /opt/modules/ && module load ${{ matrix.mpilib }} + PL_DEVICE=lightning.gpu /opt/mpi/${{ matrix.mpilib }}/bin/mpirun -np 2 python -m pytest ./mpitests $COVERAGE_FLAGS + mv coverage.xml coverage-${{ github.job }}-lightning_gpu_${{ matrix.mpilib }}-main.xml + # PL_DEVICE=lightning.gpu /opt/mpi/${{ matrix.mpilib }}/bin/mpirun --oversubscribe -n 4 pytest -s -x mpitests/test_device.py -k test_create_device $COVERAGE_FLAGS + + - name: Upload code coverage results + uses: actions/upload-artifact@v3 + with: + name: ubuntu-codecov-results-python + path: coverage-${{ github.job }}-lightning_gpu_${{ matrix.mpilib }}-*.xml + if-no-files-found: error + + - name: Cleanup + if: always() + run: | + rm -rf ${{ steps.setup_venv.outputs.venv_name }} + rm -rf * .git .gitignore .github + pip cache purge + upload-to-codecov-linux-cpp: needs: ["cpp_tests"] name: Upload coverage data to codecov @@ -182,4 +288,31 @@ jobs: run: | rm -rf ${{ steps.setup_venv.outputs.venv_name }} rm -rf * .git .gitignore .github - pip cache purge \ No newline at end of file + pip cache purge + + upload-to-codecov-linux-python: + needs: ["python_tests"] + name: Upload coverage data to codecov + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Download coverage reports + uses: 
actions/download-artifact@v3 + with: + name: ubuntu-codecov-results-python + + - name: Upload to Codecov + uses: codecov/codecov-action@v3 + with: + fail_ci_if_error: true + verbose: true + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Cleanup + if: always() + run: | + rm -rf ${{ steps.setup_venv.outputs.venv_name }} + rm -rf * .git .gitignore .github + pip cache purge diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3268036a65..634565cded 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: "-sn", # Don't display the score "--rcfile=.pylintrc", # Link to your config file ] - exclude: ^(bin/|doc/|scripts/|setup.py|tests/) + exclude: ^(bin/|doc/|scripts/|setup.py|tests/|mpitests/) diff --git a/Makefile b/Makefile index f8a62fc407..d07ae44dbc 100644 --- a/Makefile +++ b/Makefile @@ -110,7 +110,7 @@ format-cpp: ./bin/format $(CHECK) --cfversion $(if $(version:-=),$(version),0) ./pennylane_lightning format-python: - black -l 100 ./pennylane_lightning/ ./tests $(CHECK) + black -l 100 ./pennylane_lightning/ ./mpitests ./tests $(CHECK) .PHONY: check-tidy check-tidy: diff --git a/mpitests/.pylintrc b/mpitests/.pylintrc new file mode 100644 index 0000000000..3847296e98 --- /dev/null +++ b/mpitests/.pylintrc @@ -0,0 +1,52 @@ +[MASTER] +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-whitelist=numpy,scipy,autograd,toml,appdir,autograd.numpy,autograd.numpy.linalg,autograd.numpy.builtins,semantic_version,torch,tensorflow,tensorflow.contrib,tensorflow.contrib.eager,LazyLoader,networkx,networkx.dag +ignore-patterns=test_legacy* + +[TYPECHECK] + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. 
It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules=numpy,scipy,autograd,toml,appdir,autograd.numpy,autograd.numpy.linalg,autograd.numpy.builtins,semantic_version,torch,tensorflow,tensorflow.contrib,tensorflow.contrib.eager,LazyLoader,networkx,networkx.dag,math,pennylane.numpy + +# List of classes names for which member attributes should not be checked +# (useful for classes with attributes dynamically set). This supports can work +# with qualified names. +ignored-classes=numpy,scipy,autograd,toml,appdir,autograd.numpy,autograd.numpy.linalg,autograd.numpy.builtins,semantic_version,torch,tensorflow,tensorflow.contrib,tensorflow.contrib.eager,LazyLoader,networkx,networkx.dag,math,pennylane.numpy,pennylane.numpy.random,pennylane.numpy.linalg,pennylane.numpy.builtins,pennylane.operation,rustworkx,kahypar + +[MESSAGES CONTROL] + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). +# Cyclical import checks are disabled for now as they are frequently used in +# the code base, but this can be removed in the future once cycles are resolved. +disable= + line-too-long, + invalid-name, + too-many-lines, + redefined-builtin, + too-many-locals, + duplicate-code, + cyclic-import, + import-error, + bad-option-value, + import-outside-toplevel, + missing-class-docstring, + missing-function-docstring, + no-self-use + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. 
+notes= diff --git a/mpitests/conftest.py b/mpitests/conftest.py new file mode 100644 index 0000000000..09ab802b05 --- /dev/null +++ b/mpitests/conftest.py @@ -0,0 +1,120 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Pytest configuration file for PennyLane-Lightning-GPU test suite. +""" +# pylint: disable=missing-function-docstring,wrong-import-order,unused-import + +import itertools +import os +import pytest + +from pennylane import numpy as np +import pennylane as qml + +# Tuple passed to distributed device ctor +# np.complex for data type and True or False +# for enabling MPI or not. 
+fixture_params = itertools.product( + [np.complex64, np.complex128], + [True, False], +) + +# defaults +TOL = 1e-6 +TOL_STOCHASTIC = 0.05 + +U = np.array( + [ + [0.83645892 - 0.40533293j, -0.20215326 + 0.30850569j], + [-0.23889780 - 0.28101519j, -0.88031770 - 0.29832709j], + ] +) + +U2 = np.array([[0, 1, 1, 1], [1, 0, 1, -1], [1, -1, 0, 1], [1, 1, -1, 0]]) / np.sqrt(3) +A = np.array([[1.02789352, 1.61296440 - 0.3498192j], [1.61296440 + 0.3498192j, 1.23920938 + 0j]]) + +THETA = np.linspace(0.11, 1, 3) +PHI = np.linspace(0.32, 1, 3) +VARPHI = np.linspace(0.02, 1, 3) + + +@pytest.fixture(scope="session") +def tol(): + """Numerical tolerance for equality tests.""" + return float(os.environ.get("TOL", TOL)) + + +@pytest.fixture(scope="session", params=[2, 3]) +def n_subsystems(request): + """Number of qubits or qumodes.""" + return request.param + + +# Looking for the device for testing. +default_device = "lightning.gpu" +supported_devices = {"lightning.gpu"} +supported_devices.update({sb.replace(".", "_") for sb in supported_devices}) + + +def get_device(): + """Return the pennylane lightning device. + + The device is ``lightning.gpu`` by default. + Allowed values are: "lightning.gpu". + An underscore can also be used instead of a dot. + If the environment variable ``PL_DEVICE`` is defined, its value is used. + Underscores are replaced by dots upon exiting. + """ + device = None + if "PL_DEVICE" in os.environ: + device = os.environ.get("PL_DEVICE", default_device) + device = device.replace("_", ".") + if device is None: + device = default_device + if device not in supported_devices: + raise ValueError(f"Invalid backend {device}.") + return device + + +device_name = get_device() + +if device_name not in qml.plugin_devices: + raise qml.DeviceError( + f"Device {device_name} does not exist. Make sure the required plugin is installed." 
+ ) + +# Device specification +if device_name == "lightning.gpu": + from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice +else: + raise qml.DeviceError(f"The MPI tests do not apply to the {device_name} device.") + + +# General qubit_device fixture, for any number of wires. +@pytest.fixture( + scope="function", + params=fixture_params, +) +def qubit_device(request): + def _device(wires): + return qml.device( + device_name, + wires=wires, + mpi=True, + c_dtype=request.param[0], + batch_obs=request.param[1], + ) + + return _device diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py new file mode 100644 index 0000000000..3657c336f8 --- /dev/null +++ b/mpitests/test_adjoint_jacobian.py @@ -0,0 +1,1364 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the :mod:`pennylane_lightning_gpu.LightningGPU` device (MPI). 
+""" +# pylint: disable=protected-access,cell-var-from-loop,c-extension-no-member +import itertools +import math +from mpi4py import MPI +import pytest +from conftest import device_name, LightningDevice as ld + +from scipy.stats import unitary_group +import pennylane as qml +from pennylane import numpy as np +from pennylane import QNode, qnode + +I, X, Y, Z = ( + np.eye(2), + qml.PauliX.compute_matrix(), + qml.PauliY.compute_matrix(), + qml.PauliZ.compute_matrix(), +) + +# Tuple passed to distributed device ctor +# np.complex for data type and True or False +# for enabling batched_obs. +fixture_params = itertools.product( + [np.complex64, np.complex128], + [True, False], +) + + +def Rx(theta): + r"""One-qubit rotation about the x axis. + + Args: + theta (float): rotation angle + Returns: + array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_x \theta/2}` + """ + return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * X + + +def Ry(theta): + r"""One-qubit rotation about the y axis. + + Args: + theta (float): rotation angle + Returns: + array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_y \theta/2}` + """ + return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Y + + +def Rz(theta): + r"""One-qubit rotation about the z axis. 
+ + Args: + theta (float): rotation angle + Returns: + array: unitary 2x2 rotation matrix :math:`e^{-i \sigma_z \theta/2}` + """ + return math.cos(theta / 2) * I + 1j * math.sin(-theta / 2) * Z + + +class TestAdjointJacobian: # pylint: disable=too-many-public-methods + """Tests for the adjoint_jacobian method""" + + @pytest.fixture(params=fixture_params) + def dev(self, request): + """Returns a PennyLane device.""" + return qml.device( + device_name, + wires=8, + mpi=True, + c_dtype=request.param[0], + batch_obs=request.param[1], + ) + + def test_not_expval(self, dev): + """Test if a QuantumFunctionError is raised for a tape with measurements that are not + expectation values""" + + with qml.tape.QuantumTape() as tape: + qml.RX(0.1, wires=0) + qml.var(qml.PauliZ(0)) + + with pytest.raises( + qml.QuantumFunctionError, match="Adjoint differentiation method does not" + ): + dev.adjoint_jacobian(tape) + + with qml.tape.QuantumTape() as tape: + qml.RX(0.1, wires=0) + qml.state() + + if device_name == "lightning.gpu" and ld._CPP_BINARY_AVAILABLE: + message = "Adjoint differentiation does not support State measurements." + elif ld._CPP_BINARY_AVAILABLE: + message = "This method does not support statevector return type." 
+ else: + message = "Adjoint differentiation method does not support measurement StateMP" + with pytest.raises( + qml.QuantumFunctionError, + match=message, + ): + dev.adjoint_jacobian(tape) + + def test_finite_shots_warns(self): + """Tests warning raised when finite shots specified""" + + dev = qml.device(device_name, wires=8, mpi=True, shots=1) + + with qml.tape.QuantumTape() as tape: + qml.expval(qml.PauliZ(0)) + + with pytest.warns( + UserWarning, + match="Requested adjoint differentiation to be computed with finite shots.", + ): + dev.adjoint_jacobian(tape) + + def test_empty_measurements(self, dev): + """Tests if an empty array is returned when the measurements of the tape is empty.""" + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + + jac = dev.adjoint_jacobian(tape) + assert len(jac) == 0 + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_unsupported_op(self, dev): + """Test if a QuantumFunctionError is raised for an unsupported operation, i.e., + multi-parameter operations that are not qml.Rot""" + + with qml.tape.QuantumTape() as tape: + qml.CRot(0.1, 0.2, 0.3, wires=[0, 1]) + qml.expval(qml.PauliZ(0)) + + with pytest.raises( + qml.QuantumFunctionError, + match="The CRot operation is not supported using the", + ): + dev.adjoint_jacobian(tape) + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_proj_unsupported(self, dev): + """Test if a QuantumFunctionError is raised for a Projector observable""" + with qml.tape.QuantumTape() as tape: + qml.CRX(0.1, wires=[0, 1]) + qml.expval(qml.Projector([0, 1], wires=[0, 1])) + + with pytest.raises( + qml.QuantumFunctionError, + match="differentiation method does not support the Projector", + ): + dev.adjoint_jacobian(tape) + + with qml.tape.QuantumTape() as tape: + qml.CRX(0.1, wires=[0, 1]) + qml.expval(qml.Projector([0], wires=[0]) @ qml.PauliZ(0)) + + with pytest.raises( + qml.QuantumFunctionError, 
+ match="differentiation method does not support the Projector", + ): + dev.adjoint_jacobian(tape) + + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) + @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ]) + @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep]) + def test_pauli_rotation_gradient(self, stateprep, G, theta, dev): + """Tests that the automatic gradients of Pauli rotations are correct.""" + random_state = np.array( + [0.43593284 - 0.02945156j, 0.40812291 + 0.80158023j], requires_grad=False + ) + + tape = qml.tape.QuantumScript( + [G(theta, 0)], [qml.expval(qml.PauliZ(0))], [stateprep(random_state, 0)] + ) + + tape.trainable_params = {1} + + calculated_val = dev.adjoint_jacobian(tape) + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + # compare to finite differences + tapes, fn = qml.gradients.param_shift(tape) + numeric_val = fn(qml.execute(tapes, dev, None)) + assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0) + + @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7)) + @pytest.mark.parametrize("stateprep", [qml.QubitStateVector, qml.StatePrep]) + def test_Rot_gradient(self, stateprep, theta, dev): + """Tests that the device gradient of an arbitrary Euler-angle-parameterized gate is + correct.""" + params = np.array([theta, theta**3, np.sqrt(2) * theta]) + + with qml.tape.QuantumTape() as tape: + stateprep(np.array([1.0, -1.0], requires_grad=False) / np.sqrt(2), wires=0) + qml.Rot(*params, wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1, 2, 3} + + calculated_val = dev.adjoint_jacobian(tape) + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + # compare to finite differences + tapes, fn = qml.gradients.param_shift(tape) + numeric_val = fn(qml.execute(tapes, dev, None)) + assert np.allclose(calculated_val, numeric_val, atol=tol, rtol=0) + + @pytest.mark.parametrize("par", [1, -2, 1.623, -0.051, 0]) # integers, floats, zero + def 
test_ry_gradient(self, par, tol, dev): + """Test that the gradient of the RY gate matches the exact analytic formula.""" + with qml.tape.QuantumTape() as tape: + qml.RY(par, wires=[0]) + qml.expval(qml.PauliX(0)) + + tape.trainable_params = {0} + + # gradients + exact = np.cos(par) + grad_A = dev.adjoint_jacobian(tape) + + # different methods must agree + assert np.allclose(grad_A, exact, atol=tol, rtol=0) + + def test_rx_gradient(self, tol, dev): + """Test that the gradient of the RX gate matches the known formula.""" + a = 0.7418 + + with qml.tape.QuantumTape() as tape: + qml.RX(a, wires=0) + qml.expval(qml.PauliZ(0)) + + # circuit jacobians + dev_jacobian = dev.adjoint_jacobian(tape) + expected_jacobian = -np.sin(a) + assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0) + + def test_multiple_rx_gradient_pauliz(self, tol, dev): + """Tests that the gradient of multiple RX gates in a circuit yields the correct result.""" + params = np.array([np.pi, np.pi / 2, np.pi / 3]) + + with qml.tape.QuantumTape() as tape: + qml.RX(params[0], wires=0) + qml.RX(params[1], wires=1) + qml.RX(params[2], wires=2) + + for idx in range(3): + qml.expval(qml.PauliZ(idx)) + + # circuit jacobians + dev_jacobian = dev.adjoint_jacobian(tape) + expected_jacobian = -np.diag(np.sin(params)) + assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0) + + def test_multiple_rx_gradient_hermitian(self, tol, dev): + """Tests that the gradient of multiple RX gates in a circuit yields the correct result + with Hermitian observable + """ + params = np.array([np.pi, np.pi / 2, np.pi / 3]) + + with qml.tape.QuantumTape() as tape: + qml.RX(params[0], wires=0) + qml.RX(params[1], wires=1) + qml.RX(params[2], wires=2) + + for idx in range(3): + qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx])) + + tape.trainable_params = {0, 1, 2} + # circuit jacobians + dev_jacobian = dev.adjoint_jacobian(tape) + expected_jacobian = -np.diag(np.sin(params)) + + assert 
np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+
+    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
+    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
+
+    def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
+        """Tests that the gradient of multiple RX gates in a circuit yields the correct result
+        with Hermitian observable
+        """
+        params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(params[0], wires=0)
+            qml.RX(params[1], wires=1)
+            qml.RX(params[2], wires=2)
+
+            qml.expval(
+                qml.Hermitian(
+                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
+                    wires=[0, 2],
+                )
+            )
+
+        tape.trainable_params = {0, 1, 2}
+        dev_jacobian = dev.adjoint_jacobian(tape)
+        expected_jacobian = np.array(
+            [
+                -np.sin(params[0]) * np.cos(params[2]),
+                0,
+                -np.cos(params[0]) * np.sin(params[2]),
+            ]
+        )
+
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+
+    qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]  # pylint: disable=no-member
+    ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
+
+    @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
+        """Tests that the gradient of multiple RX gates in a circuit yields the correct result
+        with a Hamiltonian observable
+        """
+        params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
+
+        ham = qml.Hamiltonian(
+            [1.0, 0.3, 0.3, 0.4],
+            [
+                qml.PauliX(0) @ qml.PauliX(1),
+                qml.PauliZ(0),
+                qml.PauliZ(1),
+                qml.Hermitian(
+                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]],
+                    wires=[0, 2],
+                ),
+            ],
+        )
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(params[0], wires=0)
+            qml.RX(params[1], wires=1)
+            qml.RX(params[2], wires=2)
+
+            qml.expval(ham)
+
+        tape.trainable_params = {0, 1, 2}
+        dev_jacobian = 
dev.adjoint_jacobian(tape) + expected_jacobian = ( + 0.3 * np.array([-np.sin(params[0]), 0, 0]) + + 0.3 * np.array([0, -np.sin(params[1]), 0]) + + 0.4 + * np.array( + [ + -np.sin(params[0]) * np.cos(params[2]), + 0, + -np.cos(params[0]) * np.sin(params[2]), + ] + ) + ) + + assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0) + + qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__] # pylint: disable=no-member + ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot} + + @pytest.mark.parametrize("obs", [qml.PauliX, qml.PauliY]) + @pytest.mark.parametrize( + "op", + [ + qml.RX(0.4, wires=0), + qml.RY(0.6, wires=0), + qml.RZ(0.8, wires=0), + qml.CRX(1.0, wires=[0, 1]), + qml.CRY(2.0, wires=[0, 1]), + qml.CRZ(3.0, wires=[0, 1]), + qml.Rot(0.2, -0.1, 0.2, wires=0), + ], + ) + def test_gradients_pauliz(self, op, obs, dev): + """Tests that the gradients of circuits match between the finite difference and device + methods.""" + # op.num_wires and op.num_params must be initialized a priori + with qml.tape.QuantumTape() as tape: + qml.Hadamard(wires=0) + qml.RX(0.543, wires=0) + qml.CNOT(wires=[0, 1]) + + op # pylint: disable=pointless-statement + + qml.Rot(1.3, -2.3, 0.5, wires=[0]) + qml.RZ(-0.5, wires=0) + qml.adjoint(qml.RY(0.5, wires=1), lazy=False) + qml.CNOT(wires=[0, 1]) + + qml.expval(obs(wires=0)) + qml.expval(qml.PauliZ(wires=1)) + + tape.trainable_params = set(range(1, 1 + op.num_params)) + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + # pylint: disable=unnecessary-direct-lambda-call + grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape)) + grad_D = dev.adjoint_jacobian(tape) + + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + @pytest.mark.parametrize( + "op", + [ + qml.RX(0.4, wires=0), + qml.RY(0.6, wires=0), + qml.RZ(0.8, wires=0), + qml.CRX(1.0, wires=[0, 1]), + qml.CRY(2.0, wires=[0, 1]), + qml.CRZ(3.0, wires=[0, 1]), + qml.Rot(0.2, -0.1, 0.2, 
wires=0), + ], + ) + def test_gradients_hermitian(self, op, dev): + """Tests that the gradients of circuits match between the finite difference and device + methods.""" + # op.num_wires and op.num_params must be initialized a priori + with qml.tape.QuantumTape() as tape: + qml.Hadamard(wires=0) + qml.RX(0.543, wires=0) + qml.CNOT(wires=[0, 1]) + + op.queue() + + qml.Rot(1.3, -2.3, 0.5, wires=[0]) + qml.RZ(-0.5, wires=0) + qml.adjoint(qml.RY(0.5, wires=1), lazy=False) + qml.CNOT(wires=[0, 1]) + + qml.expval( + qml.Hermitian( + [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]], + wires=[0, 1], + ) + ) + + tape.trainable_params = set(range(1, 1 + op.num_params)) + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + # pylint: disable=unnecessary-direct-lambda-call + grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))(*qml.gradients.param_shift(tape)) + grad_D = dev.adjoint_jacobian(tape) + + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + def test_gradient_gate_with_multiple_parameters_pauliz(self, dev): + """Tests that gates with multiple free parameters yield correct gradients.""" + x, y, z = [0.5, 0.3, -0.7] + + tape = qml.tape.QuantumScript( + [ + qml.RX(0.4, wires=[0]), + qml.Rot(x, y, z, wires=[0]), + qml.RY(-0.2, wires=[0]), + ], + [qml.expval(qml.PauliZ(0))], + ) + + tape.trainable_params = {1, 2, 3} + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + grad_D = dev.adjoint_jacobian(tape) + tapes, fn = qml.gradients.param_shift(tape) + grad_F = fn(qml.execute(tapes, dev, None)) + + # gradient has the correct shape and every element is nonzero + assert len(grad_D) == 3 + assert all(isinstance(v, np.ndarray) for v in grad_D) + assert np.count_nonzero(grad_D) == 3 + # the different methods agree + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + def test_gradient_gate_with_multiple_parameters_hermitian(self, dev): + """Tests that gates with multiple free parameters yield correct gradients.""" + x, y, z = [0.5, 0.3, -0.7] + + tape 
= qml.tape.QuantumScript( + [ + qml.RX(0.4, wires=[0]), + qml.Rot(x, y, z, wires=[0]), + qml.RY(-0.2, wires=[0]), + ], + [qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0))], + ) + + tape.trainable_params = {1, 2, 3} + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + grad_D = dev.adjoint_jacobian(tape) + tapes, fn = qml.gradients.param_shift(tape) + grad_F = fn(qml.execute(tapes, dev, None)) + + # gradient has the correct shape and every element is nonzero + assert len(grad_D) == 3 + assert all(isinstance(v, np.ndarray) for v in grad_D) + assert np.count_nonzero(grad_D) == 3 + # the different methods agree + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev): + """Tests that gates with multiple free parameters yield correct gradients.""" + x, y, z = [0.5, 0.3, -0.7] + + ham = qml.Hamiltonian( + [1.0, 0.3, 0.3], + [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)], + ) + + tape = qml.tape.QuantumScript( + [ + qml.RX(0.4, wires=[0]), + qml.Rot(x, y, z, wires=[0]), + qml.RY(-0.2, wires=[0]), + ], + [qml.expval(ham)], + ) + + tape.trainable_params = {1, 2, 3} + + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + grad_D = dev.adjoint_jacobian(tape) + tapes, fn = qml.gradients.param_shift(tape) + grad_F = fn(qml.execute(tapes, dev, None)) + + # gradient has the correct shape and every element is nonzero + assert len(grad_D) == 3 + assert all(isinstance(v, np.ndarray) for v in grad_D) + assert np.count_nonzero(grad_D) == 3 + # the different methods agree + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + def test_use_device_state(self, tol, dev): + """Tests that when using the device state, the correct answer is still returned.""" + + x, y, z = [0.5, 0.3, -0.7] + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.Rot(x, y, z, wires=[0]) + qml.RY(-0.2, 
wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1, 2, 3} + + dM1 = dev.adjoint_jacobian(tape) + + qml.execute([tape], dev, None) + dM2 = dev.adjoint_jacobian(tape, use_device_state=True) + + assert np.allclose(dM1, dM2, atol=tol, rtol=0) + + def test_provide_starting_state(self, tol, dev): + """Tests provides correct answer when provided starting state.""" + comm = MPI.COMM_WORLD + + x, y, z = [0.5, 0.3, -0.7] + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.Rot(x, y, z, wires=[0]) + qml.RY(-0.2, wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1, 2, 3} + + dM1 = dev.adjoint_jacobian(tape) + + if device_name == "lightning.gpu": + local_state_vector = dev.state + complex_type = np.complex128 if dev.R_DTYPE == np.float64 else np.complex64 + state_vector = np.zeros(1 << 8).astype(complex_type) + comm.Allgather(local_state_vector, state_vector) + qml.execute([tape], dev, None) + dM2 = dev.adjoint_jacobian(tape, starting_state=state_vector) + assert np.allclose(dM1, dM2, atol=tol, rtol=0) + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_provide_wrong_starting_state(self, dev): + """Tests raise an exception when provided starting state mismatches.""" + x, y, z = [0.5, 0.3, -0.7] + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.Rot(x, y, z, wires=[0]) + qml.RY(-0.2, wires=[0]) + qml.expval(qml.PauliZ(0)) + + tape.trainable_params = {1, 2, 3} + + with pytest.raises( + qml.QuantumFunctionError, + match="The number of qubits of starting_state must be the same as", + ): + dev.adjoint_jacobian(tape, starting_state=np.ones(7)) + + @pytest.mark.skipif( + device_name == "lightning.gpu", + reason="Adjoint differentiation does not support State measurements.", + ) + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_state_return_type(self, dev): + """Tests raise an exception when the return type is 
State""" + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.state() + + tape.trainable_params = {0} + + with pytest.raises( + qml.QuantumFunctionError, + match="This method does not support statevector return type.", + ): + dev.adjoint_jacobian(tape) + + +class TestAdjointJacobianQNode: + """Test QNode integration with the adjoint_jacobian method""" + + @pytest.fixture(params=fixture_params) + def dev(self, request): + """Returns a PennyLane device.""" + return qml.device( + device_name, + wires=8, + mpi=True, + c_dtype=request.param[0], + batch_obs=request.param[1], + ) + + def test_finite_shots_warning(self): + """Tests that a warning is raised when computing the adjoint diff on a device with finite shots""" + + dev = qml.device(device_name, wires=8, mpi=True, shots=1) + + with pytest.warns( + UserWarning, + match="Requested adjoint differentiation to be computed with finite shots.", + ): + + @qml.qnode(dev, diff_method="adjoint") + def circ(x): + qml.RX(x, wires=0) + return qml.expval(qml.PauliZ(0)) + + with pytest.warns( + UserWarning, + match="Requested adjoint differentiation to be computed with finite shots.", + ): + qml.grad(circ)(0.1) + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_qnode(self, mocker, dev): + """Test that specifying diff_method allows the adjoint method to be selected""" + args = np.array([0.54, 0.1, 0.5], requires_grad=True) + + def circuit(x, y, z): + qml.Hadamard(wires=0) + qml.RX(0.543, wires=0) + qml.CNOT(wires=[0, 1]) + + qml.Rot(x, y, z, wires=0) + + qml.Rot(1.3, -2.3, 0.5, wires=[0]) + qml.RZ(-0.5, wires=0) + qml.RY(0.5, wires=1) + qml.CNOT(wires=[0, 1]) + + return qml.expval(qml.PauliX(0) @ qml.PauliZ(1)) + + qnode1 = QNode(circuit, dev, diff_method="adjoint") + spy = mocker.spy(dev, "adjoint_jacobian") + + grad_fn = qml.grad(qnode1) + grad_A = grad_fn(*args) + + spy.assert_called() + + h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if 
dev.R_DTYPE == np.float32 else 1e-7 + + qnode2 = QNode(circuit, dev, diff_method="finite-diff", h=h) + grad_fn = qml.grad(qnode2) + grad_F = grad_fn(*args) + + assert np.allclose(grad_A, grad_F, atol=tol, rtol=0) + + thetas = np.linspace(-2 * np.pi, 2 * np.pi, 8) + + @pytest.mark.parametrize("reused_p", thetas**3 / 19) + @pytest.mark.parametrize("other_p", thetas**2 / 1) + def test_fanout_multiple_params( + self, reused_p, other_p, tol, mocker, dev + ): # pylint: disable=too-many-arguments + """Tests that the correct gradient is computed for qnodes which + use the same parameter in multiple gates.""" + + def expZ(state): + return np.abs(state[0]) ** 2 - np.abs(state[1]) ** 2 + + extra_param = np.array(0.31, requires_grad=False) + + @qnode(dev, diff_method="adjoint") + def cost(p1, p2): + qml.RX(extra_param, wires=[0]) + qml.RY(p1, wires=[0]) + qml.RZ(p2, wires=[0]) + qml.RX(p1, wires=[0]) + return qml.expval(qml.PauliZ(0)) + + zero_state = np.array([1.0, 0.0]) + cost(reused_p, other_p) + + spy = mocker.spy(dev, "adjoint_jacobian") + + # analytic gradient + grad_fn = qml.grad(cost) + grad_D = grad_fn(reused_p, other_p) + + spy.assert_called_once() + + # manual gradient + grad_true0 = ( + expZ( + Rx(reused_p) @ Rz(other_p) @ Ry(reused_p + np.pi / 2) @ Rx(extra_param) @ zero_state + ) + - expZ( + Rx(reused_p) @ Rz(other_p) @ Ry(reused_p - np.pi / 2) @ Rx(extra_param) @ zero_state + ) + ) / 2 + grad_true1 = ( + expZ( + Rx(reused_p + np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state + ) + - expZ( + Rx(reused_p - np.pi / 2) @ Rz(other_p) @ Ry(reused_p) @ Rx(extra_param) @ zero_state + ) + ) / 2 + expected = grad_true0 + grad_true1 # product rule + + assert np.allclose(grad_D[0], expected, atol=tol, rtol=0) + + @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_gradient_repeated_gate_parameters(self, mocker, dev): + """Tests that repeated use of a free parameter in a multi-parameter gate yields correct + 
gradients.""" + params = np.array([0.8, 1.3], requires_grad=True) + + def circuit(params): + qml.RX(np.array(np.pi / 4, requires_grad=False), wires=[0]) + qml.Rot(params[1], params[0], 2 * params[0], wires=[0]) + return qml.expval(qml.PauliX(0)) + + spy_analytic = mocker.spy(dev, "adjoint_jacobian") + + h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + cost = QNode(circuit, dev, diff_method="finite-diff", h=h) + + grad_fn = qml.grad(cost) + grad_F = grad_fn(params) + + spy_analytic.assert_not_called() + + cost = QNode(circuit, dev, diff_method="adjoint") + grad_fn = qml.grad(cost) + grad_D = grad_fn(params) + + spy_analytic.assert_called_once() + + # the different methods agree + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + def test_interface_tf(self, dev): + """Test if gradients agree between the adjoint and finite-diff methods when using the + TensorFlow interface""" + + tf = pytest.importorskip("tensorflow") + + def f(params1, params2): + qml.RX(0.4, wires=[0]) + qml.RZ(params1 * tf.sqrt(params2), wires=[0]) + qml.RY(tf.cos(params2), wires=[0]) + return qml.expval(qml.PauliZ(0)) + + if dev.R_DTYPE == np.float32: + tf_r_dtype = tf.float32 + else: + tf_r_dtype = tf.float64 + + params1 = tf.Variable(0.3, dtype=tf_r_dtype) + params2 = tf.Variable(0.4, dtype=tf_r_dtype) + + h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + qnode1 = QNode(f, dev, interface="tf", diff_method="adjoint") + qnode2 = QNode(f, dev, interface="tf", diff_method="finite-diff", h=h) + + with tf.GradientTape() as tape: + res1 = qnode1(params1, params2) + + g1 = tape.gradient(res1, [params1, params2]) + + with tf.GradientTape() as tape: + res2 = qnode2(params1, params2) + + g2 = tape.gradient(res2, [params1, params2]) + + assert np.allclose(g1, g2, atol=tol) + + def test_interface_torch(self, dev): + """Test if gradients agree between the adjoint and finite-diff methods when 
using the + Torch interface""" + + torch = pytest.importorskip("torch") + + def f(params1, params2): + qml.RX(0.4, wires=[0]) + qml.RZ(params1 * torch.sqrt(params2), wires=[0]) + qml.RY(torch.cos(params2), wires=[0]) + return qml.expval(qml.PauliZ(0)) + + params1 = torch.tensor(0.3, requires_grad=True) + params2 = torch.tensor(0.4, requires_grad=True) + + h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + qnode1 = QNode(f, dev, interface="torch", diff_method="adjoint") + qnode2 = QNode(f, dev, interface="torch", diff_method="finite-diff", h=h) + + res1 = qnode1(params1, params2) + res1.backward() + + grad_adjoint = params1.grad, params2.grad + + res2 = qnode2(params1, params2) + res2.backward() + + grad_fd = params1.grad, params2.grad + + assert np.allclose(grad_adjoint, grad_fd) + + def test_interface_jax(self, dev): + """Test if the gradients agree between adjoint and finite-difference methods in the + jax interface""" + + jax = pytest.importorskip("jax") + if dev.R_DTYPE == np.float64: + from jax.config import config # pylint: disable=import-outside-toplevel + + config.update("jax_enable_x64", True) + + def f(params1, params2): + qml.RX(0.4, wires=[0]) + qml.RZ(params1 * jax.numpy.sqrt(params2), wires=[0]) + qml.RY(jax.numpy.cos(params2), wires=[0]) + return qml.expval(qml.PauliZ(0)) + + params1 = jax.numpy.array(0.3, dev.R_DTYPE) + params2 = jax.numpy.array(0.4, dev.R_DTYPE) + + h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + qnode_adjoint = QNode(f, dev, interface="jax", diff_method="adjoint") + qnode_fd = QNode(f, dev, interface="jax", diff_method="finite-diff", h=h) + + grad_adjoint = jax.grad(qnode_adjoint)(params1, params2) + grad_fd = jax.grad(qnode_fd)(params1, params2) + + assert np.allclose(grad_adjoint, grad_fd, atol=tol) + + +def circuit_ansatz(params, wires): + """Circuit ansatz containing all the parametrized gates""" + qml.QubitStateVector(unitary_group.rvs(2**8, random_state=0)[0], 
wires=wires) + qml.RX(params[0], wires=wires[0]) + qml.RY(params[1], wires=wires[1]) + qml.adjoint(qml.RX(params[2], wires=wires[2])) + qml.RZ(params[0], wires=wires[3]) + qml.CRX(params[3], wires=[wires[3], wires[0]]) + qml.PhaseShift(params[4], wires=wires[2]) + qml.CRY(params[5], wires=[wires[2], wires[1]]) + qml.adjoint(qml.CRZ(params[5], wires=[wires[0], wires[3]])) + qml.adjoint(qml.PhaseShift(params[6], wires=wires[0])) + qml.Rot(params[6], params[7], params[8], wires=wires[0]) + qml.adjoint(qml.Rot(params[8], params[8], params[9], wires=wires[1])) + qml.MultiRZ(params[11], wires=[wires[0], wires[1]]) + qml.PauliRot(params[12], "XXYZ", wires=[wires[0], wires[1], wires[2], wires[3]]) + qml.CPhase(params[12], wires=[wires[3], wires[2]]) + qml.IsingXX(params[13], wires=[wires[1], wires[0]]) + qml.IsingXY(params[14], wires=[wires[3], wires[2]]) + qml.IsingYY(params[14], wires=[wires[3], wires[2]]) + qml.IsingZZ(params[14], wires=[wires[2], wires[1]]) + qml.U1(params[15], wires=wires[0]) + qml.U2(params[16], params[17], wires=wires[0]) + qml.U3(params[18], params[19], params[20], wires=wires[1]) + qml.adjoint(qml.CRot(params[21], params[22], params[23], wires=[wires[1], wires[2]])) + qml.SingleExcitation(params[24], wires=[wires[2], wires[0]]) + qml.DoubleExcitation(params[25], wires=[wires[2], wires[0], wires[1], wires[3]]) + qml.SingleExcitationPlus(params[26], wires=[wires[0], wires[2]]) + qml.SingleExcitationMinus(params[27], wires=[wires[0], wires[2]]) + qml.DoubleExcitationPlus(params[27], wires=[wires[2], wires[0], wires[1], wires[3]]) + qml.DoubleExcitationMinus(params[27], wires=[wires[2], wires[0], wires[1], wires[3]]) + qml.RX(params[28], wires=wires[0]) + qml.RX(params[29], wires=wires[1]) + + +@pytest.mark.parametrize( + "returns", + [ + qml.PauliZ(0), + qml.PauliX(2), + qml.PauliZ(0) @ qml.PauliY(3), + qml.Hadamard(2), + qml.Hadamard(3) @ qml.PauliZ(2), + qml.PauliX(0) @ qml.PauliY(3), + qml.PauliY(0) @ qml.PauliY(2) @ qml.PauliY(3), + 
qml.Hermitian( + np.kron(qml.PauliY.compute_matrix(), qml.PauliZ.compute_matrix()), + wires=[3, 2], + ), + qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=0), + qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=0) @ qml.PauliZ(2), + ], +) +def test_integration(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations""" + dev_def = qml.device("default.qubit", wires=range(8)) + dev_lightning = qml.device(device_name, wires=range(8), mpi=True) + + def circuit(params): + circuit_ansatz(params, wires=range(8)) + return qml.expval(returns), qml.expval(qml.PauliY(1)) + + n_params = 30 + params = np.linspace(0, 10, n_params) + + qnode_def = qml.QNode(circuit, dev_def) + qnode_lightning = qml.QNode(circuit, dev_lightning, diff_method="adjoint") + + def casted_to_array_def(params): + return np.array(qnode_def(params)) + + def casted_to_array_lightning(params): + return np.array(qnode_lightning(params)) + + j_def = qml.jacobian(casted_to_array_def)(params) + j_lightning = qml.jacobian(casted_to_array_lightning)(params) + + assert np.allclose(j_def, j_lightning) + + +custom_wires = ["alice", 3.14, -1, 0, "bob", 1, "unit", "test"] + + +@pytest.mark.parametrize( + "returns", + [ + qml.PauliZ(custom_wires[0]), + qml.PauliX(custom_wires[2]), + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.Hadamard(custom_wires[2]), + qml.Hadamard(custom_wires[3]) @ qml.PauliZ(custom_wires[2]), + # qml.Projector([0, 1], wires=[custom_wires[0], custom_wires[2]]) @ qml.Hadamard(custom_wires[3]) + # qml.Projector([0, 0], wires=[custom_wires[2], custom_wires[0]]) + qml.PauliX(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.PauliY(custom_wires[0]) @ qml.PauliY(custom_wires[2]) @ qml.PauliY(custom_wires[3]), + qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=custom_wires[0]), + qml.Hermitian( + np.kron(qml.PauliY.compute_matrix(), 
qml.PauliZ.compute_matrix()), + wires=[custom_wires[3], custom_wires[2]], + ), + qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=custom_wires[0]) + @ qml.PauliZ(custom_wires[2]), + ], +) +def test_integration_custom_wires(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + dev_def = qml.device("default.qubit", wires=custom_wires) + dev_lightning = qml.device(device_name, wires=custom_wires, mpi=True, batch_obs=False) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return qml.expval(returns), qml.expval(qml.PauliY(custom_wires[1])) + + n_params = 30 + params = np.linspace(0, 10, n_params) + + qnode_def = qml.QNode(circuit, dev_def) + qnode_lightning = qml.QNode(circuit, dev_lightning, diff_method="adjoint") + + def casted_to_array_def(params): + return np.array(qnode_def(params)) + + def casted_to_array_lightning(params): + return np.array(qnode_lightning(params)) + + j_def = qml.jacobian(casted_to_array_def)(params) + j_lightning = qml.jacobian(casted_to_array_lightning)(params) + + assert np.allclose(j_def, j_lightning) + + +@pytest.mark.parametrize( + "returns", + [ + (qml.PauliZ(custom_wires[0]),), + (qml.PauliZ(custom_wires[0]), qml.PauliZ(custom_wires[1])), + ( + qml.PauliZ(custom_wires[0]), + qml.PauliZ(custom_wires[1]), + qml.PauliZ(custom_wires[3]), + ), + ( + qml.PauliZ(custom_wires[0]), + qml.PauliZ(custom_wires[1]), + qml.PauliZ(custom_wires[3]), + qml.PauliZ(custom_wires[2]), + ), + ( + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.PauliZ(custom_wires[1]) @ qml.PauliY(custom_wires[2]), + ), + ( + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.PauliZ(custom_wires[1]), + ), + ], +) +def test_integration_custom_wires_batching(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire 
labels""" + + dev_def = qml.device("default.qubit", wires=custom_wires) + dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True, batch_obs=True) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return [qml.expval(r) for r in returns] + [qml.expval(qml.PauliY(custom_wires[1]))] + + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_def = qml.QNode(circuit, dev_def) + + def convert_to_array_gpu(params): + return np.hstack(qnode_gpu(params)) + + def convert_to_array_def(params): + return np.hstack(qnode_def(params)) + + j_gpu = qml.jacobian(convert_to_array_gpu)(params) + j_def = qml.jacobian(convert_to_array_def)(params) + + assert np.allclose(j_gpu, j_def, atol=1e-7) + + +@pytest.mark.parametrize( + "returns", + [ + (0.5 * qml.PauliZ(custom_wires[0]),), + (0.5 * qml.PauliZ(custom_wires[0]), qml.PauliZ(custom_wires[1])), + ( + qml.PauliZ(custom_wires[0]), + 0.5 * qml.PauliZ(custom_wires[1]), + qml.PauliZ(custom_wires[3]), + ), + ( + qml.PauliZ(custom_wires[0]), + qml.PauliZ(custom_wires[1]), + qml.PauliZ(custom_wires[3]), + 0.5 * qml.PauliZ(custom_wires[2]), + ), + ( + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + 0.5 * qml.PauliZ(custom_wires[1]) @ qml.PauliY(custom_wires[2]), + ), + ( + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + 0.5 * qml.PauliZ(custom_wires[1]), + ), + ( + 0.0 * qml.PauliZ(custom_wires[0]) @ qml.PauliZ(custom_wires[1]), + 1.0 * qml.Identity(10), + 1.2 * qml.PauliZ(custom_wires[2]) @ qml.PauliZ(custom_wires[3]), + ), + ], +) +def test_batching_H(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + dev_cpu = qml.device("default.qubit", wires=custom_wires + [10, 72]) + dev_gpu = qml.device(device_name, wires=custom_wires + [10, 72], batch_obs=True) + dev_gpu_default = 
qml.device(device_name, wires=custom_wires + [10, 72], batch_obs=False) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return qml.math.hstack([qml.expval(r) for r in returns]) + + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_gpu_default = qml.QNode(circuit, dev_gpu_default, diff_method="adjoint") + + j_cpu = qml.jacobian(qnode_cpu)(params) + j_gpu = qml.jacobian(qnode_gpu)(params) + j_gpu_default = qml.jacobian(qnode_gpu_default)(params) + + assert np.allclose(j_cpu, j_gpu) + assert np.allclose(j_gpu, j_gpu_default) + + +@pytest.fixture(scope="session") +def create_xyz_file(tmp_path_factory): + """Creates a coordinate file for an H2 molecule in the XYZ format.""" + directory = tmp_path_factory.mktemp("tmp") + file = directory / "h2.xyz" + file.write_text("""2\nH2, Unoptimized\nH 1.0 0.0 0.0\nH -1.0 0.0 0.0""") + yield file + + +@pytest.mark.parametrize( + "batches", + [False, True, 1, 2, 3, 4], +) +def test_integration_H2_Hamiltonian( + create_xyz_file, batches +): # pylint: disable=redefined-outer-name + """Tests getting the total energy and its derivatives for an H2 Hamiltonian.""" + _ = pytest.importorskip("openfermionpyscf") + + n_electrons = 2 + np.random.seed(1337) + + str_path = create_xyz_file + symbols, coordinates = qml.qchem.read_structure(str(str_path), outpath=str(str_path.parent)) + + H, qubits = qml.qchem.molecular_hamiltonian( + symbols, + coordinates, + method="pyscf", + basis="6-31G", + active_electrons=n_electrons, + name="h2", + outpath=str(str_path.parent), + load_data=True, + ) + hf_state = qml.qchem.hf_state(n_electrons, qubits) + _, doubles = qml.qchem.excitations(n_electrons, qubits) + + # Choose different batching supports here + dev = qml.device(device_name, wires=qubits, mpi=True, batch_obs=batches) + dev_comp = qml.device("default.qubit", 
wires=qubits) + + @qml.qnode(dev, diff_method="adjoint") + def circuit(params, excitations): + qml.BasisState(hf_state, wires=H.wires) + for i, excitation in enumerate(excitations): + if len(excitation) == 4: + qml.DoubleExcitation(params[i], wires=excitation) + else: + qml.SingleExcitation(params[i], wires=excitation) + return qml.expval(H) + + @qml.qnode(dev_comp, diff_method="parameter-shift") + def circuit_compare(params, excitations): + qml.BasisState(hf_state, wires=H.wires) + + for i, excitation in enumerate(excitations): + if len(excitation) == 4: + qml.DoubleExcitation(params[i], wires=excitation) + else: + qml.SingleExcitation(params[i], wires=excitation) + return qml.expval(H) + + jac_func = qml.jacobian(circuit) + jac_func_comp = qml.jacobian(circuit_compare) + + params = qml.numpy.array([0.0] * len(doubles), requires_grad=True) + jacs = jac_func(params, excitations=doubles) + jacs_comp = jac_func_comp(params, excitations=doubles) + + assert np.allclose(jacs, jacs_comp) + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliY(wires=custom_wires[1])] + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])] + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], [qml.PauliX(wires=custom_wires[1]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [1.1], [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])] + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + ], +) +def test_adjoint_SparseHamiltonian_custom_wires(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" 
+ + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True) + dev_cpu = qml.device("default.qubit", wires=custom_wires) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) + + +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliZ(1) @ qml.PauliX(0) @ qml.Identity(2) @ qml.PauliX(4) @ qml.Identity(5)], + ).sparse_matrix(range(len(custom_wires))), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(1) @ qml.PauliZ(0)], + ).sparse_matrix(range(len(custom_wires))), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0)], + ).sparse_matrix(range(len(custom_wires))), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(5)], + ).sparse_matrix(range(len(custom_wires))), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(0) @ qml.PauliZ(1)], + ).sparse_matrix(range(len(custom_wires))), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(1) @ qml.PauliZ(2)]).sparse_matrix( + range(len(custom_wires)) + ), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + qml.Hamiltonian([2.0], [qml.PauliX(2) @ qml.PauliZ(4)]).sparse_matrix( + range(len(custom_wires)) + ), + wires=range(len(custom_wires)), + ), + qml.SparseHamiltonian( + 
qml.Hamiltonian([1.1], [qml.PauliX(2) @ qml.PauliZ(0)]).sparse_matrix( + range(len(custom_wires)) + ), + wires=range(len(custom_wires)), + ), + ], +) +def test_adjoint_SparseHamiltonian(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + comm = MPI.COMM_WORLD + dev_gpu = qml.device("lightning.gpu", wires=len(custom_wires), mpi=True) + dev_cpu = qml.device("default.qubit", wires=len(custom_wires)) + + def circuit(params): + circuit_ansatz(params, wires=range(len(custom_wires))) + return qml.expval(returns) + + if comm.Get_rank() == 0: + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + else: + params = None + + params = comm.bcast(params, root=0) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="adjoint") + qnode_cpu = qml.QNode(circuit, dev_cpu, diff_method="parameter-shift") + + j_gpu = qml.jacobian(qnode_gpu)(params) + j_cpu = qml.jacobian(qnode_cpu)(params) + + assert np.allclose(j_cpu, j_gpu) diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py new file mode 100644 index 0000000000..ad9e474fb4 --- /dev/null +++ b/mpitests/test_apply.py @@ -0,0 +1,1049 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the :mod:`pennylane_lightning_gpu.LightningGPU` device (MPI). 
+""" +# pylint: disable=protected-access,cell-var-from-loop,c-extension-no-member +import itertools +from mpi4py import MPI +import pytest + +from conftest import TOL_STOCHASTIC, device_name, fixture_params + +import numpy as np +import pennylane as qml + + +numQubits = 8 + +# Tuple passed to distributed device ctor +# np.complex for data type and True or False +# for enabling batched_obs. +fixture_params = itertools.product( + [np.complex64, np.complex128], + [True, False], +) + + +def create_random_init_state(numWires, R_DTYPE, seed_value=48): + """Returns a random initial state of a certain type.""" + np.random.seed(seed_value) + num_elements = 1 << numWires + init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand( + num_elements + ).astype(R_DTYPE) + scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE) + init_state = init_state / scale_sum + return init_state + + +def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires): + """Wrapper applying a parametric gate with QNode function.""" + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + def circuit(*params): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(*params, wires=Wires) + return qml.state() + + cpu_qnode = qml.QNode(circuit, dev_cpu) + expected_output_cpu = 
cpu_qnode(*par).astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + mpi_qnode = qml.QNode(circuit, dev_mpi) + local_state_vector = mpi_qnode(*par) + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + +def apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires): + """Wrapper applying a parametric gate with the apply method.""" + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + @qml.qnode(dev_cpu) + def circuit(*params): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(*params, wires=Wires) + return qml.state() + + expected_output_cpu = np.array(circuit(*par)).astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + dev_mpi.syncH2D(local_state_vector) + dev_mpi.apply([operation(*par, wires=Wires)]) + dev_mpi.syncD2H(local_state_vector) + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + +def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires): + """Wrapper applying a non-parametric gate with QNode function.""" + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE 
== np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + def circuit(): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(wires=Wires) + return qml.state() + + cpu_qnode = qml.QNode(circuit, dev_cpu) + expected_output_cpu = cpu_qnode().astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + mpi_qnode = qml.QNode(circuit, dev_mpi) + local_state_vector = mpi_qnode() + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + +def apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires): + """Wrapper applying a non-parametric gate with the apply method.""" + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + @qml.qnode(dev_cpu) + def circuit(): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(wires=Wires) + return qml.state() 
+ + expected_output_cpu = np.array(circuit()).astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + dev_mpi.syncH2D(local_state_vector) + dev_mpi.apply([operation(wires=Wires)]) + dev_mpi.syncD2H(local_state_vector) + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + +class TestApply: # pylint: disable=missing-function-docstring,too-many-arguments + """Tests whether the device can apply supported quantum gates.""" + + @pytest.fixture(params=fixture_params) + def dev_mpi(self, request): + return qml.device( + device_name, + wires=numQubits, + mpi=True, + c_dtype=request.param[0], + batch_obs=request.param[1], + ) + + # Parameterized test case for single wire nonparam gates + @pytest.mark.parametrize( + "operation", [qml.PauliX, qml.PauliY, qml.PauliZ, qml.Hadamard, qml.S, qml.T] + ) + @pytest.mark.parametrize("Wires", [0, 1, numQubits - 2, numQubits - 1]) + def test_apply_operation_single_wire_nonparam(self, tol, operation, Wires, dev_mpi): + apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires) + apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires) + + @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ]) + @pytest.mark.parametrize("Wires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]]) + def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi): + apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires) + apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires) + + @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli]) + @pytest.mark.parametrize( + "Wires", + [ + [0, 1, 2], + [numQubits - 3, numQubits - 2, numQubits - 1], + [0, 1, numQubits - 1], + [0, numQubits - 2, numQubits - 1], + ], + ) + def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mpi): + apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires) + 
apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires) + + @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli]) + @pytest.mark.parametrize( + "Wires", + [ + [0, 1, 2], + [numQubits - 3, numQubits - 2, numQubits - 1], + [0, 1, numQubits - 1], + [0, numQubits - 2, numQubits - 1], + ], + ) + def test_apply_operation_three_wire_qnode_nonparam(self, tol, operation, Wires, dev_mpi): + apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires) + apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires) + + @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ]) + @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]]) + @pytest.mark.parametrize("Wires", [0, numQubits - 1]) + def test_apply_operation_1gatequbit_1param_gate_qnode_param( + self, tol, operation, par, Wires, dev_mpi + ): + apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires) + apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires) + + @pytest.mark.parametrize("operation", [qml.Rot]) + @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]) + @pytest.mark.parametrize("Wires", [0, numQubits - 1]) + def test_apply_operation_1gatequbit_3param_gate_qnode_param( + self, tol, operation, par, Wires, dev_mpi + ): + apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires) + apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires) + + @pytest.mark.parametrize("operation", [qml.CRot]) + @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]) + @pytest.mark.parametrize("Wires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]) + def test_apply_operation_1gatequbit_3param_cgate_qnode_param( + self, tol, operation, par, Wires, dev_mpi + ): + apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires) + apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires) + + @pytest.mark.parametrize( + "operation", + [ + qml.CRX, + qml.CRY, + 
qml.CRZ, + qml.ControlledPhaseShift, + qml.SingleExcitation, + qml.SingleExcitationMinus, + qml.SingleExcitationPlus, + qml.IsingXX, + qml.IsingYY, + qml.IsingZZ, + ], + ) + @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]]) + @pytest.mark.parametrize("Wires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]]) + def test_apply_operation_2gatequbit_1param_gate_qnode_param( + self, tol, operation, par, Wires, dev_mpi + ): + apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires) + apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires) + + @pytest.mark.parametrize( + "operation", + [qml.DoubleExcitation, qml.DoubleExcitationMinus, qml.DoubleExcitationPlus], + ) + @pytest.mark.parametrize("par", [[0.13], [0.2], [0.3]]) + @pytest.mark.parametrize( + "Wires", + [ + [0, 1, numQubits - 2, numQubits - 1], + [0, 1, 2, 3], + [numQubits - 4, numQubits - 3, numQubits - 2, numQubits - 1], + ], + ) + def test_apply_operation_4gatequbit_1param_gate_qnode_param( + self, tol, operation, par, Wires, dev_mpi + ): + apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires) + apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires) + + # BasisState test + @pytest.mark.parametrize("operation", [qml.BasisState]) + @pytest.mark.parametrize("index", range(numQubits)) + def test_state_prep(self, tol, operation, index, dev_mpi): + par = np.zeros(numQubits, dtype=int) + par[index] = 1 + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + state_vector = np.zeros(1 << num_wires).astype(c_dtype) + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + 
state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + def circuit(): + operation(par, wires=range(numQubits)) + return qml.state() + + cpu_qnode = qml.QNode(circuit, dev_cpu) + mpi_qnode = qml.QNode(circuit, dev_mpi) + + expected_output_cpu = cpu_qnode().astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + local_state_vector = mpi_qnode() + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + @pytest.mark.parametrize( + "par, Wires", + [ + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [0]), + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [1]), + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [2]), + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [3]), + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [4]), + (np.array([1 / np.sqrt(2), 1 / np.sqrt(2)]), [5]), + (np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), [1, 0]), + (np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), [0, 1]), + (np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), [0, 2]), + ( + np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), + [numQubits - 2, numQubits - 1], + ), + ( + np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), + [0, numQubits - 1], + ), + ( + np.array([0, 1 / np.sqrt(2), 0, 1 / np.sqrt(2)]), + [0, numQubits - 2], + ), + ], + ) + def test_qubit_state_prep(self, tol, par, Wires, dev_mpi): + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + state_vector = np.zeros(1 << num_wires).astype(c_dtype) + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = 
np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + def circuit(): + qml.StatePrep(par, wires=Wires) + return qml.state() + + cpu_qnode = qml.QNode(circuit, dev_cpu) + mpi_qnode = qml.QNode(circuit, dev_mpi) + + expected_output_cpu = cpu_qnode().astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + local_state_vector = mpi_qnode() + + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + def test_dev_reset(self, tol, dev_mpi): + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = num_wires - num_global_wires + + if dev_mpi.R_DTYPE == np.float32: + c_dtype = np.complex64 + else: + c_dtype = np.complex128 + + state_vector = np.zeros(1 << num_wires).astype(c_dtype) + expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype) + local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype) + local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype) + + dev_cpu.reset() + + def circuit(): + qml.PauliX(wires=[0]) + qml.PauliX(wires=[0]) + return qml.state() + + cpu_qnode = qml.QNode(circuit, dev_cpu) + + expected_output_cpu = cpu_qnode().astype(c_dtype) + comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0) + + dev_mpi.reset() + + gpumpi_qnode = qml.QNode(circuit, dev_mpi) + dev_mpi.reset() + + local_state_vector = gpumpi_qnode() + assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0) + + +class TestSparseHamExpval: # pylint: 
disable=too-few-public-methods,missing-function-docstring + """Tests sparse hamiltonian expectation values.""" + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_sparse_hamiltonian_expectation(self, C_DTYPE): + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = 3 - num_global_wires + + obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2) + obs1 = qml.Identity(1) + Hmat = qml.Hamiltonian([1.0, 1.0], [obs1, obs]).sparse_matrix() + + state_vector = np.array( + [ + 0.0 + 0.0j, + 0.0 + 0.1j, + 0.1 + 0.1j, + 0.1 + 0.2j, + 0.2 + 0.2j, + 0.2 + 0.3j, + 0.3 + 0.3j, + 0.3 + 0.5j, + ], + dtype=C_DTYPE, + ) + + local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE) + comm.Scatter(state_vector, local_state_vector, root=0) + + dev_gpu = qml.device("lightning.gpu", wires=3, mpi=False, c_dtype=C_DTYPE) + dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True, c_dtype=C_DTYPE) + + dev_mpi.syncH2D(local_state_vector) + dev_gpu.syncH2D(state_vector) + + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(3)) + + comm.Barrier() + + res = dev_mpi.expval(H_sparse) + expected = dev_gpu.expval(H_sparse) + + assert np.allclose(res, expected) + + +class TestExpval: + """Tests that expectation values are properly calculated or that the proper errors are raised.""" + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + @pytest.mark.parametrize( + "operation", + [ + qml.PauliX, + qml.PauliY, + qml.PauliZ, + qml.Hadamard, + qml.Identity, + ], + ) + @pytest.mark.parametrize("wires", [0, 1, 2, numQubits - 3, numQubits - 2, numQubits - 1]) + def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE): + """Tests that expectation values are properly calculated for single-wire observables without parameters.""" + num_wires = numQubits + comm = MPI.COMM_WORLD + commSize = comm.Get_size() + num_global_wires = commSize.bit_length() - 1 + num_local_wires = 
num_wires - num_global_wires + + dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=C_DTYPE) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE) + comm.Scatter(state_vector, local_state_vector, root=0) + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE) + + def circuit(): + qml.StatePrep(state_vector, wires=range(num_wires)) + return qml.expval(operation(wires)) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + expected_output_cpu = cpu_qnode() + comm.Bcast(expected_output_cpu, root=0) + + mpi_qnode = qml.QNode(circuit, dev_mpi) + expected_output_mpi = mpi_qnode() + + assert np.allclose(expected_output_mpi, expected_output_cpu, atol=tol, rtol=0) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + @pytest.mark.parametrize( + "obs", + [ + qml.PauliX(0) @ qml.PauliZ(1), + qml.PauliX(0) @ qml.PauliZ(numQubits - 1), + qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1), + qml.PauliZ(0) @ qml.PauliZ(1), + qml.PauliZ(0) @ qml.PauliZ(numQubits - 1), + qml.PauliZ(numQubits - 2) @ qml.PauliZ(numQubits - 1), + ], + ) + def test_expval_multiple_obs(self, obs, tol, C_DTYPE): + """Test expval with Hamiltonian""" + num_wires = numQubits + + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE) + dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE) + + def circuit(): + qml.RX(0.4, wires=[0]) + qml.RY(-0.2, wires=[num_wires - 1]) + return qml.expval(obs) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + mpi_qnode = qml.QNode(circuit, dev_mpi) + + assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + @pytest.mark.parametrize( + "obs, coeffs", + [ + ([qml.PauliX(0) @ qml.PauliZ(1)], [0.314]), + ([qml.PauliX(0) @ qml.PauliZ(numQubits - 1)], [0.314]), + ([qml.PauliZ(0) @ 
qml.PauliZ(1)], [0.314]), + ([qml.PauliZ(0) @ qml.PauliZ(numQubits - 1)], [0.314]), + ( + [qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)], + [0.314, 0.2], + ), + ( + [ + qml.PauliX(0) @ qml.PauliZ(numQubits - 1), + qml.PauliZ(0) @ qml.PauliZ(1), + ], + [0.314, 0.2], + ), + ( + [ + qml.PauliX(numQubits - 2) @ qml.PauliZ(numQubits - 1), + qml.PauliZ(0) @ qml.PauliZ(1), + ], + [0.314, 0.2], + ), + ], + ) + def test_expval_hamiltonian(self, obs, coeffs, tol, C_DTYPE): + """Test expval with Hamiltonian""" + num_wires = numQubits + + ham = qml.Hamiltonian(coeffs, obs) + + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE) + dev_mpi = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=C_DTYPE) + + def circuit(): + qml.RX(0.4, wires=[0]) + qml.RY(-0.2, wires=[numQubits - 1]) + return qml.expval(ham) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + mpi_qnode = qml.QNode(circuit, dev_mpi) + + assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0) + + def test_expval_non_pauli_word_hamiltionian(self, tol): + """Tests expectation values of non-Pauli word Hamiltonians.""" + dev_mpi = qml.device("lightning.gpu", wires=3, mpi=True) + dev_cpu = qml.device("lightning.qubit", wires=3) + + theta = 0.432 + phi = 0.123 + varphi = -0.543 + + def circuit(): + qml.RX(theta, wires=[0]) + qml.RX(phi, wires=[1]) + qml.RX(varphi, wires=[2]) + qml.CNOT(wires=[0, 1]) + qml.CNOT(wires=[1, 2]) + return qml.expval(0.5 * qml.Hadamard(2)) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + mpi_qnode = qml.QNode(circuit, dev_mpi) + + assert np.allclose(cpu_qnode(), mpi_qnode(), atol=tol, rtol=0) + + +class TestGenerateSample: + """Tests that samples are properly calculated.""" + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_sample_dimensions(self, C_DTYPE): + """Tests if the samples returned by sample have + the correct dimensions + """ + num_wires = numQubits + + dev = qml.device("lightning.gpu", wires=num_wires, 
mpi=True, shots=1000, c_dtype=C_DTYPE) + + dev.apply([qml.RX(1.5708, wires=[0]), qml.RX(1.5708, wires=[1])]) + + dev.shots = 10 + dev._wires_measured = {0} + dev._samples = dev.generate_samples() + s1 = dev.sample(qml.PauliZ(wires=[0])) + assert np.array_equal(s1.shape, (10,)) + + dev.reset() + dev.shots = 12 + dev._wires_measured = {1} + dev._samples = dev.generate_samples() + s2 = dev.sample(qml.PauliZ(wires=[1])) + assert np.array_equal(s2.shape, (12,)) + + dev.reset() + dev.shots = 17 + dev._wires_measured = {0, 1} + dev._samples = dev.generate_samples() + s3 = dev.sample(qml.PauliX(0) @ qml.PauliZ(1)) + assert np.array_equal(s3.shape, (17,)) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_sample_values(self, tol, C_DTYPE): + """Tests if the samples returned by sample have + the correct values + """ + num_wires = numQubits + + dev = qml.device("lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE) + dev.reset() + dev.apply([qml.RX(1.5708, wires=[0])]) + dev._wires_measured = {0} + dev._samples = dev.generate_samples() + + s1 = dev.sample(qml.PauliZ(0)) + + # s1 should only contain 1 and -1, which is guaranteed if + # they square to 1 + assert np.allclose(s1**2, 1, atol=tol, rtol=0) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_sample_values_qnode(self, tol, C_DTYPE): + """Tests if the samples returned by sample have + the correct values + """ + num_wires = numQubits + + dev_mpi = qml.device( + "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + dev_mpi.reset() + + @qml.qnode(dev_mpi) + def circuit(): + qml.RX(1.5708, wires=0) + return qml.sample(qml.PauliZ(0)) + + # s1 should only contain 1 and -1, which is guaranteed if + # they square to 1 + assert np.allclose(circuit() ** 2, 1, atol=tol, rtol=0) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_multi_samples_return_correlated_results(self, C_DTYPE): + """Tests if 
the samples returned by the sample function have + the correct dimensions + """ + num_wires = 3 + + dev_gpumpi = qml.device( + "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + + @qml.qnode(dev_gpumpi) + def circuit(): + qml.Hadamard(0) + qml.CNOT(wires=[0, 1]) + return qml.sample(qml.PauliZ(0)), qml.sample(qml.PauliZ(1)) + + outcomes = circuit() + + assert np.array_equal(outcomes[0], outcomes[1]) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC): + """Test that a tensor product involving PauliX and PauliY works correctly""" + num_wires = 3 + + dev_gpumpi = qml.device( + "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + + theta = 0.432 + phi = 0.123 + varphi = -0.543 + + @qml.qnode(dev_gpumpi) + def circuit(): + qml.RX(theta, wires=[0]) + qml.RX(phi, wires=[1]) + qml.RX(varphi, wires=[2]) + qml.CNOT(wires=[0, 1]) + qml.CNOT(wires=[1, 2]) + return qml.sample(qml.PauliX(wires=[0]) @ qml.PauliY(wires=[2])) + + res = circuit() + + # res should only contain 1 and -1 + assert np.allclose(res**2, 1, atol=tol) + + mean = np.mean(res) + expected = np.sin(theta) * np.sin(phi) * np.sin(varphi) + assert np.allclose(mean, expected, atol=tol) + + var = np.var(res) + expected = ( + 8 * np.sin(theta) ** 2 * np.cos(2 * varphi) * np.sin(phi) ** 2 + - np.cos(2 * (theta - phi)) + - np.cos(2 * (theta + phi)) + + 2 * np.cos(2 * theta) + + 2 * np.cos(2 * phi) + + 14 + ) / 16 + assert np.allclose(var, expected, atol=tol) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC): + """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly""" + num_wires = 3 + + dev_gpumpi = qml.device( + "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + + theta = 0.432 + phi = 0.123 + varphi = -0.543 + + @qml.qnode(dev_gpumpi) + def 
circuit(): + qml.RX(theta, wires=[0]) + qml.RX(phi, wires=[1]) + qml.RX(varphi, wires=[2]) + qml.CNOT(wires=[0, 1]) + qml.CNOT(wires=[1, 2]) + return qml.sample( + qml.PauliZ(wires=[0]) @ qml.Hadamard(wires=[1]) @ qml.PauliY(wires=[2]) + ) + + res = circuit() + + # s1 should only contain 1 and -1 + assert np.allclose(res**2, 1, atol=tol) + + mean = np.mean(res) + expected = -(np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)) / np.sqrt(2) + assert np.allclose(mean, expected, atol=tol) + + var = np.var(res) + expected = ( + 3 + + np.cos(2 * phi) * np.cos(varphi) ** 2 + - np.cos(2 * theta) * np.sin(varphi) ** 2 + - 2 * np.cos(theta) * np.sin(phi) * np.sin(2 * varphi) + ) / 4 + assert np.allclose(var, expected, atol=tol) + + +class TestTensorVar: + """Test tensor variance measurements.""" + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_paulix_pauliy(self, C_DTYPE, tol=TOL_STOCHASTIC): + """Test that a tensor product involving PauliX and PauliY works correctly""" + num_wires = 3 + + dev_gpumpi = qml.device( + "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + + theta = 0.432 + phi = 0.123 + varphi = -0.543 + + @qml.qnode(dev_gpumpi) + def circuit(): + qml.RX(theta, wires=[0]) + qml.RX(phi, wires=[1]) + qml.RX(varphi, wires=[2]) + qml.CNOT(wires=[0, 1]) + qml.CNOT(wires=[1, 2]) + return qml.var(qml.PauliX(wires=[0]) @ qml.PauliY(wires=[2])) + + res = circuit() + + expected = ( + 8 * np.sin(theta) ** 2 * np.cos(2 * varphi) * np.sin(phi) ** 2 + - np.cos(2 * (theta - phi)) + - np.cos(2 * (theta + phi)) + + 2 * np.cos(2 * theta) + + 2 * np.cos(2 * phi) + + 14 + ) / 16 + assert np.allclose(res, expected, atol=tol) + + @pytest.mark.parametrize("C_DTYPE", [np.complex128, np.complex64]) + def test_pauliz_hadamard(self, C_DTYPE, tol=TOL_STOCHASTIC): + """Test that a tensor product involving PauliZ and PauliY and hadamard works correctly""" + num_wires = 3 + dev_gpumpi = qml.device( + "lightning.gpu", 
wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE + ) + + theta = 0.432 + phi = 0.123 + varphi = -0.543 + + @qml.qnode(dev_gpumpi) + def circuit(): + qml.RX(theta, wires=[0]) + qml.RX(phi, wires=[1]) + qml.RX(varphi, wires=[2]) + qml.CNOT(wires=[0, 1]) + qml.CNOT(wires=[1, 2]) + return qml.var(qml.PauliZ(wires=[0]) @ qml.Hadamard(wires=[1]) @ qml.PauliY(wires=[2])) + + res = circuit() + + expected = ( + 3 + + np.cos(2 * phi) * np.cos(varphi) ** 2 + - np.cos(2 * theta) * np.sin(varphi) ** 2 + - 2 * np.cos(theta) * np.sin(phi) * np.sin(2 * varphi) + ) / 4 + assert np.allclose(res, expected, atol=tol) + + +def circuit_ansatz(params, wires): + """Circuit ansatz containing all the parametrized gates""" + # pylint: disable=undefined-variable + qml.StatePrep( + unitary_group.rvs(2**numQubits, random_state=0)[0], + wires=wires, + ) + qml.RX(params[0], wires=wires[0]) + qml.RY(params[1], wires=wires[1]) + qml.adjoint(qml.RX(params[2], wires=wires[2])) + qml.RZ(params[0], wires=wires[3]) + qml.CRX(params[3], wires=[wires[3], wires[0]]) + qml.PhaseShift(params[4], wires=wires[2]) + qml.CRY(params[5], wires=[wires[2], wires[1]]) + qml.adjoint(qml.CRZ(params[5], wires=[wires[0], wires[3]])) + qml.adjoint(qml.PhaseShift(params[6], wires=wires[0])) + qml.Rot(params[6], params[7], params[8], wires=wires[0]) + qml.adjoint(qml.Rot(params[8], params[8], params[9], wires=wires[1])) + qml.MultiRZ(params[11], wires=[wires[0], wires[1]]) + qml.CPhase(params[12], wires=[wires[3], wires[2]]) + qml.IsingXX(params[13], wires=[wires[1], wires[0]]) + qml.IsingYY(params[14], wires=[wires[3], wires[2]]) + qml.IsingZZ(params[15], wires=[wires[2], wires[1]]) + qml.SingleExcitation(params[24], wires=[wires[2], wires[0]]) + qml.DoubleExcitation(params[25], wires=[wires[2], wires[0], wires[1], wires[3]]) + + +@pytest.mark.parametrize( + "returns", + [ + (qml.PauliX(0),), + (qml.PauliY(0),), + (qml.PauliZ(0),), + (qml.PauliX(1),), + (qml.PauliY(1),), + (qml.PauliZ(1),), + (qml.PauliX(2),), + 
(qml.PauliY(2),), + (qml.PauliZ(2),), + (qml.PauliX(3),), + (qml.PauliY(3),), + (qml.PauliZ(3),), + (qml.PauliX(0), qml.PauliY(1)), + ( + qml.PauliZ(0), + qml.PauliX(1), + qml.PauliY(2), + ), + ( + qml.PauliY(0), + qml.PauliZ(1), + qml.PauliY(3), + ), + (qml.PauliZ(0) @ qml.PauliY(3),), + (qml.Hadamard(2),), + (qml.Hadamard(3) @ qml.PauliZ(2),), + (qml.PauliX(0) @ qml.PauliY(3),), + (qml.PauliY(0) @ qml.PauliY(2) @ qml.PauliY(3),), + (qml.PauliZ(0) @ qml.PauliZ(1) @ qml.PauliZ(2),), + (0.5 * qml.PauliZ(0) @ qml.PauliZ(2),), + ], +) +def test_integration(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations""" + num_wires = numQubits + dev_default = qml.device("lightning.qubit", wires=range(num_wires)) + dev_gpu = qml.device("lightning.gpu", wires=num_wires, mpi=True, c_dtype=np.complex128) + + def circuit(params): + circuit_ansatz(params, wires=range(num_wires)) + return qml.math.hstack([qml.expval(r) for r in returns]) + + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="parameter-shift") + qnode_default = qml.QNode(circuit, dev_default, diff_method="parameter-shift") + + def convert_to_array_gpu(params): + return np.array(qnode_gpu(params)) + + def convert_to_array_default(params): + return np.array(qnode_default(params)) + + j_gpu = qml.jacobian(convert_to_array_gpu)(params) + j_default = qml.jacobian(convert_to_array_default)(params) + + assert np.allclose(j_gpu, j_default, atol=1e-7) + + +custom_wires = ["alice", 3.14, -1, 0, "bob", "l", "m", "n"] + + +@pytest.mark.parametrize( + "returns", + [ + qml.PauliZ(custom_wires[0]), + qml.PauliX(custom_wires[2]), + qml.PauliZ(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.Hadamard(custom_wires[2]), + qml.Hadamard(custom_wires[3]) @ qml.PauliZ(custom_wires[2]), + qml.PauliX(custom_wires[0]) @ qml.PauliY(custom_wires[3]), + qml.PauliY(custom_wires[0]) @ 
qml.PauliY(custom_wires[2]) @ qml.PauliY(custom_wires[3]), + ], +) +def test_integration_custom_wires(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + dev_lightning = qml.device("lightning.qubit", wires=custom_wires) + dev_gpu = qml.device("lightning.gpu", wires=custom_wires, mpi=True, c_dtype=np.complex128) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return qml.expval(returns), qml.expval(qml.PauliY(custom_wires[1])) + + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + + qnode_gpu = qml.QNode(circuit, dev_gpu, diff_method="parameter-shift") + qnode_lightning = qml.QNode(circuit, dev_lightning, diff_method="parameter-shift") + + def convert_to_array_gpu(params): + return np.array(qnode_gpu(params)) + + def convert_to_array_lightning(params): + return np.array(qnode_lightning(params)) + + j_gpu = qml.jacobian(convert_to_array_gpu)(params) + j_lightning = qml.jacobian(convert_to_array_lightning)(params) + + assert np.allclose(j_gpu, j_lightning, atol=1e-7) diff --git a/mpitests/test_device.py b/mpitests/test_device.py new file mode 100644 index 0000000000..d9761bf148 --- /dev/null +++ b/mpitests/test_device.py @@ -0,0 +1,54 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for Lightning devices creation. 
+""" +# pylint: disable=protected-access,unused-variable,missing-function-docstring,c-extension-no-member + +import pytest +from conftest import device_name, LightningDevice as ld + +import pennylane as qml +from mpi4py import MPI + +if not ld._CPP_BINARY_AVAILABLE: + pytest.skip("No binary module found. Skipping.", allow_module_level=True) + + +def test_create_device(): + if MPI.COMM_WORLD.Get_size() > 2: + with pytest.raises( + ValueError, + match="Number of devices should be larger than or equal to the number of processes on each node.", + ): + dev = qml.device(device_name, mpi=True, wires=4) + else: + dev = qml.device(device_name, mpi=True, wires=4) + + +def test_unsupported_mpi_buf_size(): + with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"): + dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=-1) + with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"): + dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=3) + with pytest.warns( + RuntimeWarning, + match="The MPI buffer size is larger than the local state vector size", + ): + dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=2**4) + with pytest.raises( + ValueError, + match="Number of processes should be smaller than the number of statevector elements", + ): + dev = qml.device(device_name, mpi=True, wires=1) diff --git a/mpitests/test_expval.py b/mpitests/test_expval.py new file mode 100644 index 0000000000..ad76da1aa5 --- /dev/null +++ b/mpitests/test_expval.py @@ -0,0 +1,332 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the expval method of Lightning devices. +""" +# pylint: disable=protected-access,too-few-public-methods,unused-import,missing-function-docstring,too-many-arguments,c-extension-no-member + +import pytest +from conftest import THETA, PHI, VARPHI, device_name + +import numpy as np +import pennylane as qml +from mpi4py import MPI + + +@pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI))) +class TestExpval: + """Test expectation values""" + + def test_identity_expectation(self, theta, phi, tol): + """Test that identity expectation value (i.e. the trace) is 1""" + dev = qml.device(device_name, mpi=True, wires=3) + if device_name == "lightning.gpu" and dev.R_DTYPE == np.float32: + pytest.skip("Skipped FP32 tests for expval in lightning.gpu") + + O1 = qml.Identity(wires=[0]) + O2 = qml.Identity(wires=[1]) + + dev.apply( + [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])], + rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()], + ) + + res = np.array([dev.expval(O1), dev.expval(O2)]) + assert np.allclose(res, np.array([1, 1]), tol) + + def test_pauliz_expectation(self, theta, phi, tol): + """Test that PauliZ expectation value is correct""" + dev = qml.device(device_name, mpi=True, wires=3) + + if device_name == "lightning.gpu" and dev.R_DTYPE == np.float32: + pytest.skip("Skipped FP32 tests for expval in lightning.gpu") + + O1 = qml.PauliZ(wires=[0]) + O2 = qml.PauliZ(wires=[1]) + + dev.apply( + [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])], + 
rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()], + ) + + res = np.array([dev.expval(O1), dev.expval(O2)]) + assert np.allclose(res, np.array([np.cos(theta), np.cos(theta) * np.cos(phi)]), tol) + + def test_paulix_expectation(self, theta, phi, tol): + """Test that PauliX expectation value is correct""" + dev = qml.device(device_name, mpi=True, wires=3) + + if device_name == "lightning.gpu" and dev.R_DTYPE == np.float32: + pytest.skip("Skipped FP32 tests for expval in lightning.gpu") + + O1 = qml.PauliX(wires=[0]) + O2 = qml.PauliX(wires=[1]) + + dev.apply( + [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])], + rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()], + ) + + res = np.array([dev.expval(O1), dev.expval(O2)], dtype=dev.C_DTYPE) + assert np.allclose( + res, + np.array([np.sin(theta) * np.sin(phi), np.sin(phi)], dtype=dev.C_DTYPE), + tol * 10, + ) + + def test_pauliy_expectation(self, theta, phi, tol): + """Test that PauliY expectation value is correct""" + dev = qml.device(device_name, mpi=True, wires=3) + + if device_name == "lightning.gpu" and dev.R_DTYPE == np.float32: + pytest.skip("Skipped FP32 tests for expval in lightning.gpu") + + O1 = qml.PauliY(wires=[0]) + O2 = qml.PauliY(wires=[1]) + + dev.apply( + [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])], + rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()], + ) + + res = np.array([dev.expval(O1), dev.expval(O2)]) + assert np.allclose(res, np.array([0, -np.cos(theta) * np.sin(phi)]), tol) + + def test_hadamard_expectation(self, theta, phi, tol): + """Test that Hadamard expectation value is correct""" + dev = qml.device(device_name, mpi=True, wires=3) + + O1 = qml.Hadamard(wires=[0]) + O2 = qml.Hadamard(wires=[1]) + + dev.apply( + [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])], + rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()], + ) + + res = np.array([dev.expval(O1), 
dev.expval(O2)]) + expected = np.array( + [ + np.sin(theta) * np.sin(phi) + np.cos(theta), + np.cos(theta) * np.cos(phi) + np.sin(phi), + ] + ) / np.sqrt(2) + assert np.allclose(res, expected, tol) + + @pytest.mark.parametrize("n_wires", range(1, 8)) + def test_hermitian_expectation(self, n_wires, theta, phi, tol): + """Test that Hadamard expectation value is correct""" + n_qubits = 7 + dev_def = qml.device("default.qubit", wires=n_qubits) + dev = qml.device(device_name, mpi=True, wires=n_qubits) + if device_name == "lightning.gpu" and dev.R_DTYPE == np.float32: + pytest.skip("Skipped FP32 tests for expval in lightning.gpu") + comm = MPI.COMM_WORLD + + m = 2**n_wires + U = np.random.rand(m, m) + 1j * np.random.rand(m, m) + U = U + np.conj(U.T) + U = U.astype(dev.C_DTYPE) + comm.Bcast(U, root=0) + obs = qml.Hermitian(U, wires=range(n_wires)) + + init_state = np.random.rand(2**n_qubits) + 1j * np.random.rand(2**n_qubits) + init_state /= np.sqrt(np.dot(np.conj(init_state), init_state)) + init_state = init_state.astype(dev.C_DTYPE) + comm.Bcast(init_state, root=0) + + def circuit(): + qml.StatePrep(init_state, wires=range(n_qubits)) + qml.RY(theta, wires=[0]) + qml.RY(phi, wires=[1]) + qml.CNOT(wires=[0, 1]) + return qml.expval(obs) + + circ = qml.QNode(circuit, dev) + comm = MPI.COMM_WORLD + mpisize = comm.Get_size() + if n_wires > n_qubits - np.log2(mpisize): + with pytest.raises( + RuntimeError, + match="MPI backend does not support Hermitian with number of target wires larger than local wire number", + ): + circ() + else: + circ_def = qml.QNode(circuit, dev_def) + assert np.allclose(circ(), circ_def(), tol) + + +@pytest.mark.parametrize("diff_method", ("parameter-shift", "adjoint")) +class TestExpOperatorArithmetic: + """Test integration of lightning with SProd, Prod, and Sum.""" + + def test_sprod(self, diff_method): + """Test the `SProd` class with lightning qubit.""" + + dev = qml.device(device_name, mpi=True, wires=2) + + @qml.qnode(dev, 
diff_method=diff_method) + def circuit(x): + qml.RX(x, wires=0) + return qml.expval(qml.s_prod(0.5, qml.PauliZ(0))) + + x = qml.numpy.array(0.123, requires_grad=True) + res = circuit(x) + assert qml.math.allclose(res, 0.5 * np.cos(x)) + + g = qml.grad(circuit)(x) + expected_grad = -0.5 * np.sin(x) + assert qml.math.allclose(g, expected_grad) + + def test_prod(self, diff_method): + """Test the `Prod` class with lightning qubit.""" + + dev = qml.device(device_name, mpi=True, wires=2) + + @qml.qnode(dev, diff_method=diff_method) + def circuit(x): + qml.RX(x, wires=0) + qml.Hadamard(1) + qml.PauliZ(1) + return qml.expval(qml.prod(qml.PauliZ(0), qml.PauliX(1))) + + x = qml.numpy.array(0.123, requires_grad=True) + res = circuit(x) + assert qml.math.allclose(res, -np.cos(x)) + + g = qml.grad(circuit)(x) + expected_grad = np.sin(x) + assert qml.math.allclose(g, expected_grad) + + def test_sum(self, diff_method): + """Test the `Sum` class with Lightning.""" + + dev = qml.device(device_name, mpi=True, wires=2) + + @qml.qnode(dev, diff_method=diff_method) + def circuit(x, y): + qml.RX(x, wires=0) + qml.RY(y, wires=1) + return qml.expval(qml.sum(qml.PauliZ(0), qml.PauliX(1))) + + x = qml.numpy.array(-3.21, requires_grad=True) + y = qml.numpy.array(2.34, requires_grad=True) + res = circuit(x, y) + assert qml.math.allclose(res, np.cos(x) + np.sin(y)) + + g = qml.grad(circuit)(x, y) + expected = (-np.sin(x), np.cos(y)) + assert qml.math.allclose(g, expected) + + def test_integration(self, diff_method): + """Test a Combination of `Sum`, `SProd`, and `Prod`.""" + + obs = qml.sum( + qml.s_prod(2.3, qml.PauliZ(0)), + -0.5 * qml.prod(qml.PauliY(0), qml.PauliZ(1)), + ) + + dev = qml.device(device_name, mpi=True, wires=2) + + @qml.qnode(dev, diff_method=diff_method) + def circuit(x, y): + qml.RX(x, wires=0) + qml.RY(y, wires=1) + return qml.expval(obs) + + x = qml.numpy.array(0.654, requires_grad=True) + y = qml.numpy.array(-0.634, requires_grad=True) + + res = circuit(x, y) + expected 
= 2.3 * np.cos(x) + 0.5 * np.sin(x) * np.cos(y) + assert qml.math.allclose(res, expected) + + g = qml.grad(circuit)(x, y) + expected = ( + -2.3 * np.sin(x) + 0.5 * np.cos(y) * np.cos(x), + -0.5 * np.sin(x) * np.sin(y), + ) + assert qml.math.allclose(g, expected) + + +@pytest.mark.parametrize("theta,phi,varphi", list(zip(THETA, PHI, VARPHI))) +class TestTensorExpval: + """Test tensor expectation values""" + + def test_paulix_pauliy(self, theta, phi, varphi, tol): + """Test that a tensor product involving PauliX and PauliY works + correctly""" + dev = qml.device(device_name, mpi=True, wires=3) + obs = qml.PauliX(0) @ qml.PauliY(2) + + dev.apply( + [ + qml.RX(theta, wires=[0]), + qml.RX(phi, wires=[1]), + qml.RX(varphi, wires=[2]), + qml.CNOT(wires=[0, 1]), + qml.CNOT(wires=[1, 2]), + ], + rotations=obs.diagonalizing_gates(), + ) + res = dev.expval(obs) + + expected = np.sin(theta) * np.sin(phi) * np.sin(varphi) + + assert np.allclose(res, expected, atol=tol) + + def test_pauliz_identity(self, theta, phi, varphi, tol): + """Test that a tensor product involving PauliZ and Identity works + correctly""" + dev = qml.device(device_name, mpi=True, wires=3) + obs = qml.PauliZ(0) @ qml.Identity(1) @ qml.PauliZ(2) + + dev.apply( + [ + qml.RX(theta, wires=[0]), + qml.RX(phi, wires=[1]), + qml.RX(varphi, wires=[2]), + qml.CNOT(wires=[0, 1]), + qml.CNOT(wires=[1, 2]), + ], + rotations=obs.diagonalizing_gates(), + ) + + res = dev.expval(obs) + + expected = np.cos(varphi) * np.cos(phi) + + assert np.allclose(res, expected, tol) + + def test_pauliz_hadamard_pauliy(self, theta, phi, varphi, tol): + """Test that a tensor product involving PauliZ and PauliY and Hadamard + works correctly""" + dev = qml.device(device_name, mpi=True, wires=3) + obs = qml.PauliZ(0) @ qml.Hadamard(1) @ qml.PauliY(2) + + dev.apply( + [ + qml.RX(theta, wires=[0]), + qml.RX(phi, wires=[1]), + qml.RX(varphi, wires=[2]), + qml.CNOT(wires=[0, 1]), + qml.CNOT(wires=[1, 2]), + ], + 
rotations=obs.diagonalizing_gates(), + ) + + res = dev.expval(obs) + expected = -(np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)) / np.sqrt(2) + + assert np.allclose(res, expected, tol) diff --git a/mpitests/test_measurements_sparse.py b/mpitests/test_measurements_sparse.py new file mode 100644 index 0000000000..4ea2856289 --- /dev/null +++ b/mpitests/test_measurements_sparse.py @@ -0,0 +1,168 @@ +# Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for Sparse Measurements Lightning devices. +""" +# pylint: disable=protected-access,too-few-public-methods,unused-import,missing-function-docstring,too-many-arguments + +import pytest +from conftest import device_name, LightningDevice as ld +from mpi4py import MPI + +import numpy as np +import pennylane as qml +from pennylane import qchem + +if not ld._CPP_BINARY_AVAILABLE: + pytest.skip("No binary module found. 
Skipping.", allow_module_level=True) + + +class TestSparseExpval: + """Tests for the expval function""" + + @pytest.fixture(params=[np.complex64, np.complex128]) + def dev(self, request): + return qml.device(device_name, mpi=True, wires=2, c_dtype=request.param) + + @pytest.mark.parametrize( + "cases", + [ + [ + qml.PauliX(0) @ qml.Identity(1), + 0.00000000000000000, + 1.000000000000000000, + ], + [ + qml.Identity(0) @ qml.PauliX(1), + -0.19866933079506122, + 0.960530638694763184, + ], + [ + qml.PauliY(0) @ qml.Identity(1), + -0.38941834230865050, + 0.848353326320648193, + ], + [ + qml.Identity(0) @ qml.PauliY(1), + 0.00000000000000000, + 1.000000119209289551, + ], + [ + qml.PauliZ(0) @ qml.Identity(1), + 0.92106099400288520, + 0.151646673679351807, + ], + [ + qml.Identity(0) @ qml.PauliZ(1), + 0.98006657784124170, + 0.039469480514526367, + ], + ], + ) + def test_sparse_Pauli_words(self, cases, tol, dev): + """Test expval of some simple sparse Hamiltonian""" + + @qml.qnode(dev, diff_method="parameter-shift") + def circuit_expval(): + qml.RX(0.4, wires=[0]) + qml.RY(-0.2, wires=[1]) + return qml.expval( + qml.SparseHamiltonian( + qml.Hamiltonian([1], [cases[0]]).sparse_matrix(), wires=[0, 1] + ) + ) + + assert np.allclose(circuit_expval(), cases[1], atol=tol, rtol=0) + + @qml.qnode(dev, diff_method="parameter-shift") + def circuit_var(): + qml.RX(0.4, wires=[0]) + qml.RY(-0.2, wires=[1]) + return qml.var( + qml.SparseHamiltonian( + qml.Hamiltonian([1], [cases[0]]).sparse_matrix(), wires=[0, 1] + ) + ) + + assert np.allclose(circuit_var(), cases[2], atol=tol, rtol=0) + + +class TestSparseExpvalQChem: + """Tests for the expval function with qchem workflow""" + + symbols = ["Li", "H"] + geometry = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 2.969280527]) + + H, qubits = qchem.molecular_hamiltonian( + symbols, + geometry, + ) + + active_electrons = 1 + + hf_state = qchem.hf_state(active_electrons, qubits) + + singles, doubles = qchem.excitations(active_electrons, qubits) + 
excitations = singles + doubles + + @pytest.fixture( + params=[np.complex64, np.complex128] if device_name != "lightning.gpu" else [np.complex128] + ) + @pytest.mark.parametrize( + "qubits, wires, H, hf_state, excitations", + [ + [qubits, range(qubits), H, hf_state, excitations], + [ + qubits, + np.random.permutation(np.arange(qubits)), + H, + hf_state, + excitations, + ], + ], + ) + def test_sparse_Pauli_words(self, qubits, wires, H, hf_state, excitations, tol, request): + """Test expval of some simple sparse Hamiltonian""" + + H_sparse = H.sparse_matrix(wires) + + dev = qml.device(device_name, mpi=True, wires=wires, c_dtype=request.param) + + @qml.qnode(dev, diff_method="parameter-shift") + def circuit(): + qml.BasisState(hf_state, wires=range(qubits)) + + for excitation in excitations: + if len(excitation) == 4: + qml.DoubleExcitation(1, wires=excitation) + elif len(excitation) == 2: + qml.SingleExcitation(1, wires=excitation) + + return qml.expval(qml.SparseHamiltonian(H_sparse, wires=wires)) + + dev_default = qml.device("default.qubit", wires=qubits) + + @qml.qnode(dev_default, diff_method="parameter-shift") + def circuit_default(): + qml.BasisState(hf_state, wires=range(qubits)) + + for excitation in excitations: + if len(excitation) == 4: + qml.DoubleExcitation(1, wires=excitation) + elif len(excitation) == 2: + qml.SingleExcitation(1, wires=excitation) + + return qml.expval(qml.SparseHamiltonian(H_sparse, wires=wires)) + + assert np.allclose(circuit(), circuit_default(), atol=tol, rtol=0) diff --git a/mpitests/test_probs.py b/mpitests/test_probs.py new file mode 100644 index 0000000000..f07a00ba6f --- /dev/null +++ b/mpitests/test_probs.py @@ -0,0 +1,312 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for the :mod:`pennylane_lightning.LightningGPU` device (MPI). +""" +# pylint: disable=missing-function-docstring,unnecessary-comprehension,too-many-arguments,wrong-import-order,unused-variable,c-extension-no-member +from mpi4py import MPI +import pytest + +from conftest import ( + device_name, +) + +import numpy as np +import pennylane as qml + +numQubits = 8 + + +def create_random_init_state(numWires, R_DTYPE, seed_value=48): + np.random.seed(seed_value) + num_elements = 1 << numWires + init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand( + num_elements + ).astype(R_DTYPE) + scale_sum = np.sqrt(np.sum(np.abs(init_state) ** 2)).astype(R_DTYPE) + init_state = init_state / scale_sum + return init_state + + +def apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE): + num_wires = numQubits + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + commSize = comm.Get_size() + + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE) + dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + def circuit(): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(wires=GateWires) + return qml.probs(wires=Wires) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + probs_cpu = cpu_qnode() + + mpi_qnode = qml.QNode(circuit, dev_mpi) + local_probs = mpi_qnode() + + recv_counts = comm.gather(len(local_probs), root=0) + + comm.Barrier() + + if rank == 0: + probs_mpi = 
np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE) + displacements = [i for i in range(commSize)] + else: + probs_mpi = None + probs_cpu = None + comm.Barrier() + comm.Gatherv(local_probs, [probs_mpi, recv_counts], root=0) + + if rank == 0: + assert np.allclose(probs_mpi, probs_cpu, atol=tol, rtol=0) + comm.Barrier() + + +def apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE): + num_wires = numQubits + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + commSize = comm.Get_size() + + dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=C_DTYPE) + dev_mpi = qml.device(device_name, wires=num_wires, mpi=True, c_dtype=C_DTYPE) + + state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE) + comm.Bcast(state_vector, root=0) + + def circuit(): + qml.StatePrep(state_vector, wires=range(num_wires)) + operation(*par, wires=GateWires) + return qml.probs(wires=Wires) + + cpu_qnode = qml.QNode(circuit, dev_cpu) + probs_cpu = cpu_qnode() + + mpi_qnode = qml.QNode(circuit, dev_mpi) + local_probs = mpi_qnode() + + recv_counts = comm.gather(len(local_probs), root=0) + + comm.Barrier() + + if rank == 0: + probs_mpi = np.zeros(1 << len(Wires)).astype(dev_mpi.R_DTYPE) + else: + probs_mpi = None + probs_cpu = None + comm.Barrier() + + comm.Gatherv(local_probs, [probs_mpi, recv_counts], root=0) + + if rank == 0: + assert np.allclose(probs_mpi, probs_cpu, atol=tol, rtol=0) + comm.Barrier() + + +class TestProbs: + """Tests for the probability method.""" + + @pytest.mark.parametrize( + "operation", [qml.PauliX, qml.PauliY, qml.PauliZ, qml.Hadamard, qml.S, qml.T] + ) + @pytest.mark.parametrize("GateWires", [[0], [numQubits - 1]]) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_single_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE): + apply_probs_nonparam(tol, 
operation, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ]) + @pytest.mark.parametrize( + "GateWires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]] + ) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_two_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE): + apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli]) + @pytest.mark.parametrize( + "GateWires", + [ + [0, 1, 2], + [numQubits - 3, numQubits - 2, numQubits - 1], + [0, 1, numQubits - 1], + [0, numQubits - 2, numQubits - 1], + ], + ) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_three_wire_nonparam(self, tol, operation, GateWires, Wires, C_DTYPE): + apply_probs_nonparam(tol, operation, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ]) + @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]]) + @pytest.mark.parametrize("GateWires", [0, numQubits - 1]) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_single_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE): + apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize("operation", [qml.Rot]) + @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]) + @pytest.mark.parametrize("GateWires", [0, numQubits - 1]) + @pytest.mark.parametrize( + 
"Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_single_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE): + apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize("operation", [qml.CRot]) + @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]) + @pytest.mark.parametrize( + "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]] + ) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_two_wire_3param(self, tol, operation, par, GateWires, Wires, C_DTYPE): + apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize( + "operation", + [ + qml.CRX, + qml.CRY, + qml.CRZ, + qml.ControlledPhaseShift, + qml.SingleExcitation, + qml.SingleExcitationMinus, + qml.SingleExcitationPlus, + qml.IsingXX, + qml.IsingYY, + qml.IsingZZ, + ], + ) + @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]]) + @pytest.mark.parametrize( + "GateWires", [[0, numQubits - 1], [0, 1], [numQubits - 2, numQubits - 1]] + ) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_two_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE): + apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE) + + @pytest.mark.parametrize( + "operation", + [qml.DoubleExcitation, qml.DoubleExcitationMinus, qml.DoubleExcitationPlus], + ) + @pytest.mark.parametrize("par", [[0.13], [0.2], [0.3]]) + @pytest.mark.parametrize( + "GateWires", + [ + [0, 1, numQubits - 2, numQubits - 
1], + [0, 1, 2, 3], + [numQubits - 4, numQubits - 3, numQubits - 2, numQubits - 1], + ], + ) + @pytest.mark.parametrize( + "Wires", + [ + [0], + [1], + [0, 1], + [0, 2], + [0, numQubits - 1], + [numQubits - 2, numQubits - 1], + range(numQubits), + ], + ) + @pytest.mark.parametrize("C_DTYPE", [np.complex128]) + def test_prob_four_wire_param(self, tol, operation, par, GateWires, Wires, C_DTYPE): + apply_probs_param(tol, operation, par, GateWires, Wires, C_DTYPE) diff --git a/pennylane_lightning/core/_serialize.py b/pennylane_lightning/core/_serialize.py index 12981faf92..4f7b3e624c 100644 --- a/pennylane_lightning/core/_serialize.py +++ b/pennylane_lightning/core/_serialize.py @@ -25,6 +25,8 @@ Identity, StatePrep, Rot, + Hamiltonian, + SparseHamiltonian, ) from pennylane.operation import Tensor from pennylane.tape import QuantumTape @@ -49,9 +51,10 @@ class QuantumScriptSerializer: """ - # pylint: disable=import-outside-toplevel, too-many-instance-attributes - def __init__(self, device_name, use_csingle: bool = False): + # pylint: disable=import-outside-toplevel, too-many-instance-attributes, c-extension-no-member + def __init__(self, device_name, use_csingle: bool = False, use_mpi: bool = False): self.use_csingle = use_csingle + self.device_name = device_name if device_name == "lightning.qubit": try: import pennylane_lightning.lightning_qubit_ops as lightning_ops @@ -75,6 +78,7 @@ def __init__(self, device_name, use_csingle: bool = False): ) from exception else: raise DeviceError(f'The device name "{device_name}" is not a valid option.') + self.statevector_c64 = lightning_ops.StateVectorC64 self.statevector_c128 = lightning_ops.StateVectorC128 self.named_obs_c64 = lightning_ops.observables.NamedObsC64 self.named_obs_c128 = lightning_ops.observables.NamedObsC128 @@ -84,6 +88,26 @@ def __init__(self, device_name, use_csingle: bool = False): self.tensor_prod_obs_c128 = lightning_ops.observables.TensorProdObsC128 self.hamiltonian_c64 = 
lightning_ops.observables.HamiltonianC64 self.hamiltonian_c128 = lightning_ops.observables.HamiltonianC128 + self.sparse_hamiltonian_c64 = lightning_ops.observables.SparseHamiltonianC64 + self.sparse_hamiltonian_c128 = lightning_ops.observables.SparseHamiltonianC128 + + self._use_mpi = use_mpi + + if self._use_mpi: + self.statevector_mpi_c64 = lightning_ops.StateVectorMPIC64 + self.statevector_mpi_c128 = lightning_ops.StateVectorMPIC128 + self.named_obs_mpi_c64 = lightning_ops.observablesMPI.NamedObsMPIC64 + self.named_obs_mpi_c128 = lightning_ops.observablesMPI.NamedObsMPIC128 + self.hermitian_obs_mpi_c64 = lightning_ops.observablesMPI.HermitianObsMPIC64 + self.hermitian_obs_mpi_c128 = lightning_ops.observablesMPI.HermitianObsMPIC128 + self.tensor_prod_obs_mpi_c64 = lightning_ops.observablesMPI.TensorProdObsMPIC64 + self.tensor_prod_obs_mpi_c128 = lightning_ops.observablesMPI.TensorProdObsMPIC128 + self.hamiltonian_mpi_c64 = lightning_ops.observablesMPI.HamiltonianMPIC64 + self.hamiltonian_mpi_c128 = lightning_ops.observablesMPI.HamiltonianMPIC128 + self.sparse_hamiltonian_mpi_c64 = lightning_ops.observablesMPI.SparseHamiltonianMPIC64 + self.sparse_hamiltonian_mpi_c128 = lightning_ops.observablesMPI.SparseHamiltonianMPIC128 + + self._mpi_manager = lightning_ops.MPIManager @property def ctype(self): @@ -95,26 +119,54 @@ def rtype(self): """Real type.""" return np.float32 if self.use_csingle else np.float64 + @property + def sv_type(self): + """State vector matching ``use_csingle`` precision (and MPI if it is supported).""" + if self._use_mpi: + return self.statevector_mpi_c64 if self.use_csingle else self.statevector_mpi_c128 + return self.statevector_c64 if self.use_csingle else self.statevector_c128 + @property def named_obs(self): """Named observable matching ``use_csingle`` precision.""" + if self._use_mpi: + return self.named_obs_mpi_c64 if self.use_csingle else self.named_obs_mpi_c128 return self.named_obs_c64 if self.use_csingle else self.named_obs_c128 
@property def hermitian_obs(self): """Hermitian observable matching ``use_csingle`` precision.""" + if self._use_mpi: + return self.hermitian_obs_mpi_c64 if self.use_csingle else self.hermitian_obs_mpi_c128 return self.hermitian_obs_c64 if self.use_csingle else self.hermitian_obs_c128 @property def tensor_obs(self): """Tensor product observable matching ``use_csingle`` precision.""" + if self._use_mpi: + return ( + self.tensor_prod_obs_mpi_c64 if self.use_csingle else self.tensor_prod_obs_mpi_c128 + ) return self.tensor_prod_obs_c64 if self.use_csingle else self.tensor_prod_obs_c128 @property def hamiltonian_obs(self): """Hamiltonian observable matching ``use_csingle`` precision.""" + if self._use_mpi: + return self.hamiltonian_mpi_c64 if self.use_csingle else self.hamiltonian_mpi_c128 return self.hamiltonian_c64 if self.use_csingle else self.hamiltonian_c128 + @property + def sparse_hamiltonian_obs(self): + """SparseHamiltonian observable matching ``use_csingle`` precision.""" + if self._use_mpi: + return ( + self.sparse_hamiltonian_mpi_c64 + if self.use_csingle + else self.sparse_hamiltonian_mpi_c128 + ) + return self.sparse_hamiltonian_c64 if self.use_csingle else self.sparse_hamiltonian_c128 + def _named_obs(self, observable, wires_map: dict): """Serializes a Named observable""" wires = [wires_map[w] for w in observable.wires] @@ -139,6 +191,37 @@ def _hamiltonian(self, observable, wires_map: dict): terms = [self._ob(t, wires_map) for t in observable.ops] return self.hamiltonian_obs(coeffs, terms) + def _sparse_hamiltonian(self, observable, wires_map: dict): + """Serialize an observable (Sparse Hamiltonian) + + Args: + observable (Observable): the input observable (Sparse Hamiltonian) + wire_map (dict): a dictionary mapping input wires to the device's backend wires + + Returns: + sparse_hamiltonian_obs (SparseHamiltonianC64 or SparseHamiltonianC128): A Sparse Hamiltonian observable object compatible with the C++ backend + """ + + if self._use_mpi: + Hmat = 
Hamiltonian([1.0], [Identity(0)]).sparse_matrix() + H_sparse = SparseHamiltonian(Hmat, wires=range(1)) + spm = H_sparse.sparse_matrix() + # Only root 0 needs the overall sparsematrix data + if self._mpi_manager().getRank() == 0: + spm = observable.sparse_matrix() + self._mpi_manager().Barrier() + else: + spm = observable.sparse_matrix() + data = np.array(spm.data).astype(self.ctype) + indices = np.array(spm.indices).astype(np.int64) + offsets = np.array(spm.indptr).astype(np.int64) + + wires = [] + wires_list = observable.wires.tolist() + wires.extend([wires_map[w] for w in wires_list]) + + return self.sparse_hamiltonian_obs(data, indices, offsets, wires) + def _pauli_word(self, observable, wires_map: dict): """Serialize a :class:`pennylane.pauli.PauliWord` into a Named or Tensor observable.""" if len(observable) == 1: @@ -166,6 +249,8 @@ def _ob(self, observable, wires_map): return self._tensor_ob(observable, wires_map) if observable.name == "Hamiltonian": return self._hamiltonian(observable, wires_map) + if observable.name == "SparseHamiltonian": + return self._sparse_hamiltonian(observable, wires_map) if isinstance(observable, (PauliX, PauliY, PauliZ, Identity, Hadamard)): return self._named_obs(observable, wires_map) if observable._pauli_rep is not None: @@ -223,7 +308,7 @@ def serialize_ops( name = single_op.name names.append(name) - if not hasattr(self.statevector_c128, name): + if not hasattr(self.sv_type, name): params.append([]) mats.append(matrix(single_op)) diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index b31d7d35ba..d276e92d29 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.33.0-dev23" +__version__ = "0.33.0-dev24" diff --git a/pennylane_lightning/core/lightning_base.py b/pennylane_lightning/core/lightning_base.py index 33a3aa59e2..9587ce3dd2 100644 --- 
a/pennylane_lightning/core/lightning_base.py +++ b/pennylane_lightning/core/lightning_base.py @@ -255,14 +255,14 @@ def _get_basis_state_index(self, state, wires): return int(qml.math.dot(state, basis_states)) # pylint: disable=too-many-function-args, assignment-from-no-return - def _process_jacobian_tape(self, tape, starting_state, use_device_state): + def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi: bool = False): state_vector = self._init_process_jacobian_tape(tape, starting_state, use_device_state) obs_serialized = QuantumScriptSerializer( - self.short_name, self.use_csingle + self.short_name, self.use_csingle, use_mpi ).serialize_observables(tape, self.wire_map) ops_serialized, use_sp = QuantumScriptSerializer( - self.short_name, self.use_csingle + self.short_name, self.use_csingle, use_mpi ).serialize_ops(tape, self.wire_map) ops_serialized = self.create_ops_list(*ops_serialized) diff --git a/pennylane_lightning/core/src/bindings/Bindings.cpp b/pennylane_lightning/core/src/bindings/Bindings.cpp index 70a192b394..425a5ea096 100644 --- a/pennylane_lightning/core/src/bindings/Bindings.cpp +++ b/pennylane_lightning/core/src/bindings/Bindings.cpp @@ -16,6 +16,9 @@ * Export C++ functions to Python using Pybind. */ #include "Bindings.hpp" +#ifdef _ENABLE_PLGPU_MPI +#include "BindingsMPI.hpp" +#endif #include "pybind11/pybind11.h" // Defining the module name. 
@@ -55,6 +58,11 @@ PYBIND11_MODULE( registerBackendSpecificInfo(m); registerLightningClassBindings(m); + +#ifdef _ENABLE_PLGPU_MPI + registerBackendSpecificInfoMPI(m); + registerLightningClassBindingsMPI(m); +#endif } #endif \ No newline at end of file diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp index 30c994a719..1368ee3a94 100644 --- a/pennylane_lightning/core/src/bindings/Bindings.hpp +++ b/pennylane_lightning/core/src/bindings/Bindings.hpp @@ -287,7 +287,8 @@ void registerInfo(py::module_ &m) { * @tparam StateVectorT * @param m Pybind module */ -template void registerObservables(py::module_ &m) { +template +void registerBackendAgnosticObservables(py::module_ &m) { using PrecisionT = typename StateVectorT::PrecisionT; // Statevector's precision. using ComplexT = @@ -627,7 +628,8 @@ template void lightningClassBindings(py::module_ &m) { /* Observables submodule */ py::module_ obs_submodule = m.def_submodule("observables", "Submodule for observables classes."); - registerObservables(obs_submodule); + registerBackendAgnosticObservables(obs_submodule); + registerBackendSpecificObservables(obs_submodule); //***********************************************************************// // Measurements diff --git a/pennylane_lightning/core/src/bindings/BindingsMPI.hpp b/pennylane_lightning/core/src/bindings/BindingsMPI.hpp new file mode 100644 index 0000000000..41276afe5d --- /dev/null +++ b/pennylane_lightning/core/src/bindings/BindingsMPI.hpp @@ -0,0 +1,483 @@ +// Copyright 2018-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file Bindings.hpp + * Defines device-agnostic operations to export to Python and other utility + * functions interfacing with Pybind11. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "CPUMemoryModel.hpp" // CPUMemoryModel, getMemoryModel, bestCPUMemoryModel, getAlignment +#include "JacobianData.hpp" +#include "Macros.hpp" // CPUArch +#include "Memory.hpp" // alignedAlloc +#include "Observables.hpp" +#include "Util.hpp" // for_each_enum + +#ifdef _ENABLE_PLGPU +#include "AdjointJacobianGPUMPI.hpp" +#include "JacobianDataMPI.hpp" +#include "LGPUBindingsMPI.hpp" +#include "MeasurementsGPUMPI.hpp" +#include "ObservablesGPUMPI.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane::LightningGPU; +using namespace Pennylane::LightningGPU::Algorithms; +using namespace Pennylane::LightningGPU::Observables; +using namespace Pennylane::LightningGPU::Measures; +} // namespace + /// @endcond + +#else + +static_assert(false, "Backend not found."); + +#endif + +namespace py = pybind11; + +namespace Pennylane { +/** + * @brief Register observable classes. + * + * @tparam StateVectorT + * @param m Pybind module + */ +template void registerObservablesMPI(py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. + using ComplexT = + typename StateVectorT::ComplexT; // Statevector's complex type. 
+ using ParamT = PrecisionT; // Parameter's data precision + + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + using np_arr_c = py::array_t, py::array::c_style>; + using np_arr_r = py::array_t; + using np_arr_sparse_ind = typename std::conditional< + std::is_same::value, + py::array_t, + py::array_t>::type; + + std::string class_name; + + class_name = "ObservableMPIC" + bitsize; + py::class_, + std::shared_ptr>>(m, class_name.c_str(), + py::module_local()); + + class_name = "NamedObsMPIC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const std::string &name, const std::vector &wires) { + return NamedObsMPI(name, wires); + })) + .def("__repr__", &NamedObsMPI::getObsName) + .def("get_wires", &NamedObsMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const NamedObsMPI &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "HermitianObsMPIC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const np_arr_c &matrix, const std::vector &wires) { + auto buffer = matrix.request(); + const auto *ptr = static_cast(buffer.ptr); + return HermitianObsMPI( + std::vector(ptr, ptr + buffer.size), wires); + })) + .def("__repr__", &HermitianObsMPI::getObsName) + .def("get_wires", &HermitianObsMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const HermitianObsMPI &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "TensorProdObsMPIC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const 
std::vector>> + &obs) { return TensorProdObsMPI(obs); })) + .def("__repr__", &TensorProdObsMPI::getObsName) + .def("get_wires", &TensorProdObsMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const TensorProdObsMPI &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "HamiltonianMPIC" + bitsize; + using ObsPtr = std::shared_ptr>; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const np_arr_r &coeffs, const std::vector &obs) { + auto buffer = coeffs.request(); + const auto ptr = static_cast(buffer.ptr); + return HamiltonianMPI{ + std::vector(ptr, ptr + buffer.size), obs}; + })) + .def("__repr__", &HamiltonianMPI::getObsName) + .def("get_wires", &HamiltonianMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const HamiltonianMPI &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); +#ifdef _ENABLE_PLGPU + class_name = "SparseHamiltonianMPIC" + bitsize; + using SpIDX = typename SparseHamiltonianMPI::IdxT; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, const np_arr_sparse_ind &indices, + const np_arr_sparse_ind &offsets, + const std::vector &wires) { + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = static_cast(buffer_data.ptr); + + const py::buffer_info buffer_indices = indices.request(); + const auto *indices_ptr = static_cast(buffer_indices.ptr); + + const py::buffer_info buffer_offsets = offsets.request(); + const auto *offsets_ptr = static_cast(buffer_offsets.ptr); + + return SparseHamiltonianMPI{ + std::vector({data_ptr, data_ptr + data.size()}), + 
std::vector({indices_ptr, indices_ptr + indices.size()}), + std::vector({offsets_ptr, offsets_ptr + offsets.size()}), + wires}; + })) + .def("__repr__", &SparseHamiltonianMPI::getObsName) + .def("get_wires", &SparseHamiltonianMPI::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonianMPI &self, + py::handle other) -> bool { + if (!py::isinstance>( + other)) { + return false; + } + auto other_cast = + other.cast>(); + return self == other_cast; + }, + "Compare two observables"); +#endif +} + +/** + * @brief Register agnostic measurements class functionalities. + * + * @tparam StateVectorT + * @tparam PyClass + * @param pyclass Pybind11's measurements class to bind methods. + */ +template +void registerBackendAgnosticMeasurementsMPI(PyClass &pyclass) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. + using ParamT = PrecisionT; // Parameter's data precision + + pyclass + .def("probs", + [](MeasurementsMPI &M, + const std::vector &wires) { + return py::array_t(py::cast(M.probs(wires))); + }) + .def("probs", + [](MeasurementsMPI &M) { + return py::array_t(py::cast(M.probs())); + }) + .def( + "expval", + [](MeasurementsMPI &M, + const std::shared_ptr> &ob) { + return M.expval(*ob); + }, + "Expected value of an observable object.") + .def( + "var", + [](MeasurementsMPI &M, + const std::shared_ptr> &ob) { + return M.var(*ob); + }, + "Variance of an observable object.") + .def("generate_samples", [](MeasurementsMPI &M, + size_t num_wires, size_t num_shots) { + auto &&result = M.generate_samples(num_shots); + const size_t ndim = 2; + const std::vector shape{num_shots, num_wires}; + constexpr auto sz = sizeof(size_t); + const std::vector strides{sz * num_wires, sz}; + // return 2-D NumPy array + return py::array(py::buffer_info( + result.data(), /* data as contiguous array */ + sz, /* size of one scalar */ + py::format_descriptor::format(), /* data type */ + ndim, /* number of dimensions */ + shape, /* 
shape of the matrix */ + strides /* strides for each axis */ + )); + }); +} + +/** + * @brief Register the adjoint Jacobian method. + */ +template +auto registerAdjointJacobianMPI( + AdjointJacobianMPI &adjoint_jacobian, const StateVectorT &sv, + const std::vector>> &observables, + const OpsData &operations, + const std::vector &trainableParams) + -> py::array_t { + using PrecisionT = typename StateVectorT::PrecisionT; + std::vector jac(observables.size() * trainableParams.size(), + PrecisionT{0.0}); + const JacobianDataMPI jd{operations.getTotalNumParams(), sv, + observables, operations, + trainableParams}; + adjoint_jacobian.adjointJacobian(std::span{jac}, jd, sv); + return py::array_t(py::cast(jac)); +} + +/** + * @brief Register agnostic algorithms. + * + * @tparam StateVectorT + * @param m Pybind module + */ +template +void registerBackendAgnosticAlgorithmsMPI(py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision + using ComplexT = + typename StateVectorT::ComplexT; // Statevector's complex type + using ParamT = PrecisionT; // Parameter's data precision + + using np_arr_c = py::array_t, py::array::c_style>; + + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + std::string class_name; + + //***********************************************************************// + // Operations + //***********************************************************************// + + class_name = "OpsStructMPIC" + bitsize; + py::class_>(m, class_name.c_str(), py::module_local()) + .def(py::init &, + const std::vector> &, + const std::vector> &, + const std::vector &, + const std::vector> &>()) + .def("__repr__", [](const OpsData &ops) { + using namespace Pennylane::Util; + std::ostringstream ops_stream; + for (size_t op = 0; op < ops.getSize(); op++) { + ops_stream << "{'name': " << ops.getOpsName()[op]; + ops_stream << ", 'params': " << ops.getOpsParams()[op]; + ops_stream << ", 'inv': " << 
ops.getOpsInverses()[op]; + ops_stream << "}"; + if (op < ops.getSize() - 1) { + ops_stream << ","; + } + } + return "Operations: [" + ops_stream.str() + "]"; + }); + + /** + * Create operation list. + */ + std::string function_name = "create_ops_listMPIC" + bitsize; + m.def( + function_name.c_str(), + [](const std::vector &ops_name, + const std::vector> &ops_params, + const std::vector> &ops_wires, + const std::vector &ops_inverses, + const std::vector &ops_matrices) { + std::vector> conv_matrices( + ops_matrices.size()); + for (size_t op = 0; op < ops_name.size(); op++) { + const auto m_buffer = ops_matrices[op].request(); + if (m_buffer.size) { + const auto m_ptr = + static_cast(m_buffer.ptr); + conv_matrices[op] = + std::vector{m_ptr, m_ptr + m_buffer.size}; + } + } + return OpsData{ops_name, ops_params, ops_wires, + ops_inverses, conv_matrices}; + }, + "Create a list of operations from data."); + + //***********************************************************************// + // Adjoint Jacobian + //***********************************************************************// + class_name = "AdjointJacobianMPIC" + bitsize; + py::class_>(m, class_name.c_str(), + py::module_local()) + .def(py::init<>()) + .def( + "batched", + [](AdjointJacobianMPI &adjoint_jacobian, + const StateVectorT &sv, + const std::vector>> + &observables, + const OpsData &operations, + const std::vector &trainableParams) { + using PrecisionT = typename StateVectorT::PrecisionT; + std::vector jac(observables.size() * + trainableParams.size(), + PrecisionT{0.0}); + const JacobianDataMPI jd{ + operations.getTotalNumParams(), sv, observables, operations, + trainableParams}; + adjoint_jacobian.adjointJacobian_serial(std::span{jac}, jd); + return py::array_t(py::cast(jac)); + }, + "Batch Adjoint Jacobian method.") + .def("__call__", ®isterAdjointJacobianMPI, + "Adjoint Jacobian method."); +} + +/** + * @brief Templated class to build lightning class bindings. 
+ * + * @tparam StateVectorT State vector type + * @param m Pybind11 module. + */ +template void lightningClassBindingsMPI(py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. + // Enable module name to be based on size of complex datatype + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + //***********************************************************************// + // StateVector + //***********************************************************************// + std::string class_name = "StateVectorMPIC" + bitsize; + auto pyclass = + py::class_(m, class_name.c_str(), py::module_local()); + pyclass.def_property_readonly("size", &StateVectorT::getLength); + + registerBackendClassSpecificBindingsMPI(pyclass); + + //***********************************************************************// + // Observables + //***********************************************************************// + + py::module_ obs_submodule = + m.def_submodule("observablesMPI", "Submodule for observables classes."); + registerObservablesMPI(obs_submodule); + + //***********************************************************************// + // Measurements + //***********************************************************************// + + class_name = "MeasurementsMPIC" + bitsize; + auto pyclass_measurements = py::class_>( + m, class_name.c_str(), py::module_local()); + + pyclass_measurements.def(py::init()); + registerBackendAgnosticMeasurementsMPI(pyclass_measurements); + registerBackendSpecificMeasurementsMPI(pyclass_measurements); + + //***********************************************************************// + // Algorithms + //***********************************************************************// + + py::module_ alg_submodule = m.def_submodule( + "algorithmsMPI", "Submodule for the algorithms functionality."); + registerBackendAgnosticAlgorithmsMPI(alg_submodule); + registerBackendSpecificAlgorithmsMPI(alg_submodule); +} 
+ +template +void registerLightningClassBindingsMPI(py::module_ &m) { + if constexpr (!std::is_same_v) { + using StateVectorT = typename TypeList::Type; + lightningClassBindingsMPI(m); + registerLightningClassBindingsMPI(m); + } +} +} // namespace Pennylane diff --git a/pennylane_lightning/core/src/observables/Observables.hpp b/pennylane_lightning/core/src/observables/Observables.hpp index 42227f62e5..183a9fcd82 100644 --- a/pennylane_lightning/core/src/observables/Observables.hpp +++ b/pennylane_lightning/core/src/observables/Observables.hpp @@ -414,4 +414,112 @@ class HamiltonianBase : public Observable { } }; +/** + * @brief Sparse representation of SparseHamiltonian + * + * @tparam T Floating-point precision. + */ +template +class SparseHamiltonianBase : public Observable { + public: + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; +#ifdef _ENABLE_PLGPU + using IdxT = + typename std::conditional::value, + int32_t, int64_t>::type; +#else + using IdxT = std::size_t; +#endif + + protected: + std::vector data_; + std::vector indices_; + std::vector offsets_; + std::vector wires_; + + private: + [[nodiscard]] bool + isEqual(const Observable &other) const override { + const auto &other_cast = + static_cast &>(other); + return data_ == other_cast.data_ && indices_ == other_cast.indices_ && + offsets_ == other_cast.offsets_ && (wires_ == other_cast.wires_); + } + + public: + /** + * @brief Create a SparseHamiltonianBase from data, indices and offsets in + * CSR format. 
+ * + * @param data Arguments to construct data + * @param indices Arguments to construct indices + * @param offsets Arguments to construct offsets + * @param wires Arguments to construct wires + */ + template > + SparseHamiltonianBase(T1 &&data, T2 &&indices, T3 &&offsets, T4 &&wires) + : data_{std::forward(data)}, indices_{std::forward(indices)}, + offsets_{std::forward(offsets)}, wires_{std::forward(wires)} { + PL_ASSERT(data_.size() == indices_.size()); + } + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param data Argument to construct data + * @param indices Argument to construct indices + * @param offsets Argument to construct offsets + * @param wires Argument to construct wires + */ + static auto create(std::initializer_list data, + std::initializer_list indices, + std::initializer_list offsets, + std::initializer_list wires) + -> std::shared_ptr> { + // NOLINTBEGIN(*-move-const-arg) + return std::shared_ptr>( + new SparseHamiltonianBase{ + std::move(data), std::move(indices), std::move(offsets), + std::move(wires)}); + // NOLINTEND(*-move-const-arg) + } + + void applyInPlace([[maybe_unused]] StateVectorT &sv) const override { + PL_ABORT("For SparseHamiltonian Observables, the applyInPlace method " + "must be " + "defined at the backend level."); + } + + [[nodiscard]] auto getObsName() const -> std::string override { + using Pennylane::Util::operator<<; + std::ostringstream ss; + ss << "SparseHamiltonian: {\n'data' : \n"; + for (const auto &d : data_) { + ss << "{" << d.real() << ", " << d.imag() << "}, "; + } + ss << ",\n'indices' : \n"; + for (const auto &i : indices_) { + ss << i << ", "; + } + ss << ",\n'offsets' : \n"; + for (const auto &o : offsets_) { + ss << o << ", "; + } + ss << "\n}"; + return ss.str(); + } + /** + * @brief 
Get the wires the observable applies to. + */ + [[nodiscard]] auto getWires() const -> std::vector override { + return wires_; + }; +}; + } // namespace Pennylane::Observables \ No newline at end of file diff --git a/pennylane_lightning/core/src/observables/tests/Test_Observables.cpp b/pennylane_lightning/core/src/observables/tests/Test_Observables.cpp index 2c4e3b60da..d0bb9799b1 100644 --- a/pennylane_lightning/core/src/observables/tests/Test_Observables.cpp +++ b/pennylane_lightning/core/src/observables/tests/Test_Observables.cpp @@ -30,8 +30,8 @@ /// @cond DEV namespace { using namespace Pennylane::Observables; - using Pennylane::Util::createProductState; +using Pennylane::Util::createRandomStateVectorData; using Pennylane::Util::createZeroState; using Pennylane::Util::isApproxEqual; using Pennylane::Util::LightningException; @@ -464,3 +464,84 @@ TEST_CASE("Methods implemented in the HamiltonianBase class", testHamiltonianBase(); } } + +template void testSparseHamiltonianBase() { + if constexpr (!std::is_same_v) { + using StateVectorT = typename TypeList::Type; + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + + const std::size_t num_qubits = 3; + std::mt19937 re{1337}; + + auto sparseH = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, {0, 1, 2}); + + DYNAMIC_SECTION("SparseHamiltonianBase - isEqual - " + << StateVectorToName::name) { + auto sparseH0 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + auto sparseH1 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, 
ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + auto sparseH2 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {8, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + + REQUIRE(*sparseH0 == *sparseH1); + REQUIRE(*sparseH0 != *sparseH2); + } + + DYNAMIC_SECTION("SparseHamiltonianBase - getWires - " + << StateVectorToName::name) { + REQUIRE(sparseH->getWires() == std::vector{0, 1, 2}); + } + + DYNAMIC_SECTION("SparseHamiltonianBase - getObsName - " + << StateVectorToName::name) { + REQUIRE(sparseH->getObsName() == + "SparseHamiltonian: {\n" + "'data' : \n" + "{1, 0}, {1, 0}, {1, 0}, {1, 0}, {1, 0}, {1, 0}, {1, 0}, " + "{1, 0}, ,\n" + "'indices' : \n" + "7, 6, 5, 4, 3, 2, 1, 0, ,\n" + "'offsets' : \n" + "0, 1, 2, 3, 4, 5, 6, 7, 8, \n" + "}"); + } + + DYNAMIC_SECTION("SparseHamiltonianBase - applyInPlace must fail - " + << StateVectorToName::name) { + auto init_state = + createRandomStateVectorData(re, num_qubits); + + StateVectorT state_vector(init_state.data(), init_state.size()); + + REQUIRE_THROWS_AS(sparseH->applyInPlace(state_vector), + LightningException); + } + + testSparseHamiltonianBase(); + } +} + +TEST_CASE("Methods implemented in the SparseHamiltonianBase class", + "[SparseHamiltonianBase]") { + if constexpr (BACKEND_FOUND) { + testSparseHamiltonianBase(); + } +} \ No newline at end of file diff --git a/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp b/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp index fdcfa8c2ea..201952efa5 100644 --- a/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp +++ 
b/pennylane_lightning/core/src/observables/tests/mpi/Test_ObservablesMPI.cpp @@ -524,3 +524,91 @@ TEST_CASE("Methods implemented in the HamiltonianBase class", testHamiltonianBase(); } } + +template void testSparseHamiltonianBase() { + if constexpr (!std::is_same_v) { + using StateVectorT = typename TypeList::Type; + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + + const std::size_t num_qubits = 3; + std::mt19937 re{1337}; + + MPIManager mpi_manager(MPI_COMM_WORLD); + + size_t mpi_buffersize = 1; + size_t nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + size_t nLocalIndexBits = num_qubits - nGlobalIndexBits; + size_t subSvLength = 1 << nLocalIndexBits; + + int nDevices = 0; + cudaGetDeviceCount(&nDevices); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + mpi_manager.Barrier(); + + std::vector expected_sv(subSvLength); + std::vector local_state(subSvLength); + + auto init_state = + createRandomStateVectorData(re, num_qubits); + + mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, + 0); + mpi_manager.Barrier(); + + DYNAMIC_SECTION("applyInPlace must fail - " + << StateVectorMPIToName::name) { + auto sparseH = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + + StateVectorT sv_mpi(mpi_manager, dt_local, mpi_buffersize, + nGlobalIndexBits, nLocalIndexBits); + + sv_mpi.CopyHostDataToGpu(local_state, false); + + REQUIRE_THROWS_AS(sparseH->applyInPlace(sv_mpi), + LightningException); + } + + DYNAMIC_SECTION("SparseHamiltonianBase - isEqual - " + << StateVectorMPIToName::name) { + auto sparseH0 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 
0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + auto sparseH1 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + auto sparseH2 = SparseHamiltonianBase::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {8, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2}); + + REQUIRE(*sparseH0 == *sparseH1); + REQUIRE(*sparseH0 != *sparseH2); + } + + testSparseHamiltonianBase(); + } +} + +TEST_CASE("Methods implemented in the SparseHamiltonianBase class", + "[SparseHamiltonianBase]") { + if constexpr (BACKEND_FOUND) { + testSparseHamiltonianBase(); + } +} diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_gpu/CMakeLists.txt index adec26a5af..5d5a336f98 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/CMakeLists.txt +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/CMakeLists.txt @@ -31,6 +31,10 @@ add_library(${PL_BACKEND} STATIC ${LGPU_FILES}) target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLGPU=1") +if(ENABLE_MPI) + target_compile_options(lightning_compile_options INTERFACE "-D_ENABLE_PLGPU_MPI=1") +endif() + ########################## ## Enforce C++ Standard ## ########################## diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp index e713ea2eef..2ebf7d3f95 100644 --- 
a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp @@ -188,10 +188,13 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) { using np_arr_c = py::array_t, py::array::c_style | py::array::forcecast>; - using sparse_index_type = std::size_t; - using np_arr_sparse_ind = - py::array_t; + using sparse_index_type = + typename std::conditional::value, int32_t, + int64_t>::type; + using np_arr_sparse_ind = typename std::conditional< + std::is_same::value, + py::array_t, + py::array_t>::type; pyclass .def("expval", @@ -205,10 +208,14 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) { const np_arr_sparse_ind &entries, const np_arr_c &values) { return M.expval( static_cast(row_map.request().ptr), - static_cast(row_map.request().size), + static_cast( + row_map.request() + .size), // int64_t is required by cusparse static_cast(entries.request().ptr), static_cast(values.request().ptr), - static_cast(values.request().size)); + static_cast( + values.request() + .size)); // int64_t is required by cusparse }, "Expected value of a sparse Hamiltonian.") .def( @@ -249,14 +256,81 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) { const np_arr_sparse_ind &entries, const np_arr_c &values) { return M.var( static_cast(row_map.request().ptr), - static_cast(row_map.request().size), + static_cast(row_map.request().size), static_cast(entries.request().ptr), static_cast(values.request().ptr), - static_cast(values.request().size)); + static_cast(values.request().size)); }, "Variance of a sparse Hamiltonian."); } +/** + * @brief Register backend specific observables. + * + * @tparam StateVectorT + * @param m Pybind module + */ +template +void registerBackendSpecificObservables(py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. 
+ using ComplexT = + typename StateVectorT::ComplexT; // Statevector's complex type. + using ParamT = PrecisionT; // Parameter's data precision + + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + using np_arr_c = py::array_t, py::array::c_style>; + + std::string class_name; + + class_name = "SparseHamiltonianC" + bitsize; + using np_arr_sparse_ind = typename std::conditional< + std::is_same::value, + py::array_t, + py::array_t>::type; + using IdxT = typename SparseHamiltonian::IdxT; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, const np_arr_sparse_ind &indices, + const np_arr_sparse_ind &offsets, + const std::vector &wires) { + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = static_cast(buffer_data.ptr); + + const py::buffer_info buffer_indices = indices.request(); + const auto *indices_ptr = + static_cast(buffer_indices.ptr); + + const py::buffer_info buffer_offsets = offsets.request(); + const auto *offsets_ptr = + static_cast(buffer_offsets.ptr); + + return SparseHamiltonian{ + std::vector({data_ptr, data_ptr + data.size()}), + std::vector({indices_ptr, indices_ptr + indices.size()}), + std::vector({offsets_ptr, offsets_ptr + offsets.size()}), + wires}; + })) + .def("__repr__", &SparseHamiltonian::getObsName) + .def("get_wires", &SparseHamiltonian::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonian &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); +} + /** * @brief Register backend specific adjoint Jacobian methods. 
* diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp new file mode 100644 index 0000000000..1ca4670fe7 --- /dev/null +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp @@ -0,0 +1,323 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include + +#include "cuda.h" + +#include "BindingsBase.hpp" +#include "Constant.hpp" +#include "ConstantUtil.hpp" // lookup +#include "DevTag.hpp" +#include "DevicePool.hpp" +#include "Error.hpp" +#include "MPIManager.hpp" +#include "MeasurementsGPUMPI.hpp" +#include "ObservablesGPUMPI.hpp" +#include "StateVectorCudaMPI.hpp" +#include "TypeList.hpp" +#include "cuda_helpers.hpp" + +/// @cond DEV +namespace { +using namespace Pennylane; +using namespace Pennylane::Bindings; +using namespace Pennylane::LightningGPU::Algorithms; +using namespace Pennylane::LightningGPU::Measures; +using namespace Pennylane::LightningGPU::Observables; +using Pennylane::LightningGPU::StateVectorCudaMPI; +} // namespace +/// @endcond + +namespace py = pybind11; + +namespace Pennylane::LightningGPU { +using StateVectorMPIBackends = + Pennylane::Util::TypeList, + StateVectorCudaMPI, void>; + +/** + * @brief Get a gate kernel map for a statevector. 
+ */ + +template +void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision + using CFP_t = + typename StateVectorT::CFP_t; // Statevector's complex precision + using ParamT = PrecisionT; // Parameter's data precision + using np_arr_c = py::array_t, + py::array::c_style | py::array::forcecast>; + using np_arr_sparse_ind = typename std::conditional< + std::is_same::value, + py::array_t, + py::array_t>::type; + + registerGatesForStateVector(pyclass); + + pyclass + .def( + py::init([](MPIManager &mpi_manager, const DevTag devtag_local, + std::size_t mpi_buf_size, std::size_t num_global_qubits, + std::size_t num_local_qubits) { + return new StateVectorT(mpi_manager, devtag_local, mpi_buf_size, + num_global_qubits, num_local_qubits); + })) // qubits, device + .def(py::init( + [](const DevTag devtag_local, std::size_t mpi_buf_size, + std::size_t num_global_qubits, std::size_t num_local_qubits) { + return new StateVectorT(devtag_local, mpi_buf_size, + num_global_qubits, num_local_qubits); + })) // qubits, device + .def( + "setBasisState", + [](StateVectorT &sv, const size_t index, const bool use_async) { + const std::complex value(1, 0); + sv.setBasisState(value, index, use_async); + }, + "Create Basis State on GPU.") + .def( + "setStateVector", + [](StateVectorT &sv, const np_arr_sparse_ind &indices, + const np_arr_c &state, const bool use_async) { + using index_type = typename std::conditional< + std::is_same::value, int32_t, int64_t>::type; + + sv.template setStateVector( + static_cast(indices.request().size), + static_cast *>( + state.request().ptr), + static_cast(indices.request().ptr), + use_async); + }, + "Set State Vector on GPU with values and their corresponding " + "indices for the state vector on device") + .def( + "DeviceToDevice", + [](StateVectorT &sv, const StateVectorT &other, bool async) { + sv.updateData(other, async); + }, + "Synchronize data from another GPU 
device to current device.") + .def("DeviceToHost", + py::overload_cast *, size_t, bool>( + &StateVectorT::CopyGpuDataToHost, py::const_), + "Synchronize data from the GPU device to host.") + .def( + "DeviceToHost", + [](const StateVectorT &gpu_sv, np_arr_c &cpu_sv, bool) { + py::buffer_info numpyArrayInfo = cpu_sv.request(); + auto *data_ptr = + static_cast *>(numpyArrayInfo.ptr); + if (cpu_sv.size()) { + gpu_sv.CopyGpuDataToHost(data_ptr, cpu_sv.size()); + } + }, + "Synchronize data from the GPU device to host.") + .def("HostToDevice", + py::overload_cast *, size_t, bool>( + &StateVectorT::CopyHostDataToGpu), + "Synchronize data from the host device to GPU.") + .def("HostToDevice", + py::overload_cast> &, + bool>(&StateVectorT::CopyHostDataToGpu), + "Synchronize data from the host device to GPU.") + .def( + "HostToDevice", + [](StateVectorT &gpu_sv, const np_arr_c &cpu_sv, bool async) { + const py::buffer_info numpyArrayInfo = cpu_sv.request(); + const auto *data_ptr = + static_cast *>(numpyArrayInfo.ptr); + const auto length = + static_cast(numpyArrayInfo.shape[0]); + if (length) { + gpu_sv.CopyHostDataToGpu(data_ptr, length, async); + } + }, + "Synchronize data from the host device to GPU.") + .def("GetNumGPUs", &getGPUCount, "Get the number of available GPUs.") + .def("getCurrentGPU", &getGPUIdx, + "Get the GPU index for the statevector data.") + .def("numQubits", &StateVectorT::getNumQubits) + .def("dataLength", &StateVectorT::getLength) + .def("resetGPU", &StateVectorT::initSV) + .def( + "apply", + [](StateVectorT &sv, const std::string &str, + const std::vector &wires, bool inv, + [[maybe_unused]] const std::vector> ¶ms, + [[maybe_unused]] const np_arr_c &gate_matrix) { + const auto m_buffer = gate_matrix.request(); + std::vector matrix_cu; + if (m_buffer.size) { + const auto m_ptr = static_cast(m_buffer.ptr); + matrix_cu = + std::vector{m_ptr, m_ptr + m_buffer.size}; + } + sv.applyOperation(str, wires, inv, std::vector{}, + matrix_cu); + }, + "Apply 
operation via the gate matrix"); +} + +/** + * @brief Register backend specific measurements class functionalities. + * + * @tparam StateVectorT + * @tparam PyClass + * @param pyclass Pybind11's measurements class to bind methods. + */ +template +void registerBackendSpecificMeasurementsMPI(PyClass &pyclass) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision + using ComplexT = + typename StateVectorT::ComplexT; // Statevector's complex type + using ParamT = PrecisionT; // Parameter's data precision + + using np_arr_c = py::array_t, + py::array::c_style | py::array::forcecast>; + using sparse_index_type = + typename std::conditional::value, int32_t, + int64_t>::type; + using np_arr_sparse_ind = typename std::conditional< + std::is_same::value, + py::array_t, + py::array_t>::type; + + pyclass + .def("expval", + static_cast::*)( + const std::string &, const std::vector &)>( + &MeasurementsMPI::expval), + "Expected value of an operation by name.") + .def( + "expval", + [](MeasurementsMPI &M, + const np_arr_sparse_ind &row_map, + const np_arr_sparse_ind &entries, const np_arr_c &values) { + return M.expval( + static_cast(row_map.request().ptr), + static_cast(row_map.request().size), + static_cast(entries.request().ptr), + static_cast(values.request().ptr), + static_cast(values.request().size)); + }, + "Expected value of a sparse Hamiltonian.") + .def( + "expval", + [](MeasurementsMPI &M, + const std::vector &pauli_words, + const std::vector> &target_wires, + const np_arr_c &coeffs) { + return M.expval(pauli_words, target_wires, + static_cast(coeffs.request().ptr)); + }, + "Expected value of Hamiltonian represented by Pauli words.") + .def( + "expval", + [](MeasurementsMPI &M, const np_arr_c &matrix, + const std::vector &wires) { + const std::size_t matrix_size = exp2(2 * wires.size()); + auto matrix_data = + static_cast(matrix.request().ptr); + std::vector matrix_v{matrix_data, + matrix_data + matrix_size}; + return M.expval(matrix_v, 
wires); + }, + "Expected value of a Hermitian observable.") + .def("var", + [](MeasurementsMPI &M, const std::string &operation, + const std::vector &wires) { + return M.var(operation, wires); + }) + .def("var", + static_cast::*)( + const std::string &, const std::vector &)>( + &MeasurementsMPI::var), + "Variance of an operation by name.") + .def( + "var", + [](MeasurementsMPI &M, + const np_arr_sparse_ind &row_map, + const np_arr_sparse_ind &entries, const np_arr_c &values) { + return M.var( + static_cast(row_map.request().ptr), + static_cast(row_map.request().size), + static_cast(entries.request().ptr), + static_cast(values.request().ptr), + static_cast(values.request().size)); + }, + "Variance of a sparse Hamiltonian."); +} + +/** + * @brief Register backend specific adjoint Jacobian methods. + * + * @tparam StateVectorT + * @param m Pybind module + */ +template +void registerBackendSpecificAlgorithmsMPI([[maybe_unused]] py::module_ &m) {} + +/** + * @brief Register bindings for backend-specific info. + * + * @param m Pybind11 module. 
+ */ +void registerBackendSpecificInfoMPI(py::module_ &m) { + using np_arr_c64 = py::array_t, + py::array::c_style | py::array::forcecast>; + using np_arr_c128 = py::array_t, + py::array::c_style | py::array::forcecast>; + py::class_(m, "MPIManager") + .def(py::init<>()) + .def(py::init()) + .def("Barrier", &MPIManager::Barrier) + .def("getRank", &MPIManager::getRank) + .def("getSize", &MPIManager::getSize) + .def("getSizeNode", &MPIManager::getSizeNode) + .def("getTime", &MPIManager::getTime) + .def("getVendor", &MPIManager::getVendor) + .def("getVersion", &MPIManager::getVersion) + .def( + "Scatter", + [](MPIManager &mpi_manager, np_arr_c64 &sendBuf, + np_arr_c64 &recvBuf, int root) { + auto send_ptr = + static_cast *>(sendBuf.request().ptr); + auto recv_ptr = + static_cast *>(recvBuf.request().ptr); + mpi_manager.template Scatter>( + send_ptr, recv_ptr, recvBuf.request().size, root); + }, + "MPI Scatter.") + .def( + "Scatter", + [](MPIManager &mpi_manager, np_arr_c128 &sendBuf, + np_arr_c128 &recvBuf, int root) { + auto send_ptr = + static_cast *>(sendBuf.request().ptr); + auto recv_ptr = + static_cast *>(recvBuf.request().ptr); + mpi_manager.template Scatter>( + send_ptr, recv_ptr, recvBuf.request().size, root); + }, + "MPI Scatter."); +} +} // namespace Pennylane::LightningGPU + /// @endcond \ No newline at end of file diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp index af2ae38ec2..8ca6eacc69 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp @@ -259,10 +259,10 @@ class Measurements final * @return auto Expectation value. 
*/ template - auto expval(const index_type *csrOffsets_ptr, - const index_type csrOffsets_size, const index_type *columns_ptr, + auto expval(const index_type *csrOffsets_ptr, const int64_t csrOffsets_size, + const index_type *columns_ptr, const std::complex *values_ptr, - const index_type numNNZ) -> PrecisionT { + const int64_t numNNZ) -> PrecisionT { const std::size_t nIndexBits = this->_statevector.getNumQubits(); const std::size_t length = std::size_t{1} << nIndexBits; @@ -580,10 +580,10 @@ class Measurements final * @return Floating point with the variance of the sparse Hamiltonian. */ template - PrecisionT - var(const index_type *csrOffsets_ptr, const index_type csrOffsets_size, - const index_type *columns_ptr, - const std::complex *values_ptr, const index_type numNNZ) { + PrecisionT var(const index_type *csrOffsets_ptr, + const int64_t csrOffsets_size, const index_type *columns_ptr, + const std::complex *values_ptr, + const int64_t numNNZ) { PL_ABORT_IF( (this->_statevector.getLength() != (size_t(csrOffsets_size) - 1)), "Statevector and Hamiltonian have incompatible sizes."); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp index f96e2bc217..ff101654df 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPUMPI.hpp @@ -47,6 +47,7 @@ using namespace Pennylane; using namespace Pennylane::Measures; using namespace Pennylane::Observables; using namespace Pennylane::LightningGPU::Observables; +using namespace Pennylane::LightningGPU::MPI; namespace cuUtil = Pennylane::LightningGPU::Util; using Pennylane::LightningGPU::StateVectorCudaManaged; using namespace Pennylane::Util; @@ -366,10 +367,10 @@ class MeasurementsMPI final * @return auto Expectation value. 
*/ template - auto expval(const index_type *csrOffsets_ptr, - const index_type csrOffsets_size, const index_type *columns_ptr, + auto expval(const index_type *csrOffsets_ptr, const int64_t csrOffsets_size, + const index_type *columns_ptr, const std::complex *values_ptr, - const index_type numNNZ) -> PrecisionT { + const int64_t numNNZ) -> PrecisionT { if (mpi_manager_.getRank() == 0) { PL_ABORT_IF_NOT( static_cast(csrOffsets_size - 1) == @@ -657,10 +658,10 @@ class MeasurementsMPI final * @return Floating point with the variance of the sparse Hamiltonian. */ template - PrecisionT - var(const index_type *csrOffsets_ptr, const index_type csrOffsets_size, - const index_type *columns_ptr, - const std::complex *values_ptr, const index_type numNNZ) { + PrecisionT var(const index_type *csrOffsets_ptr, + const int64_t csrOffsets_size, const index_type *columns_ptr, + const std::complex *values_ptr, + const int64_t numNNZ) { if (mpi_manager_.getRank() == 0) { PL_ABORT_IF_NOT( static_cast(csrOffsets_size - 1) == diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp index 82b4d41e93..36f1f1f128 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/Test_StateVectorCudaManaged_Expval.cpp @@ -336,6 +336,8 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::Hamiltonian_expval_Sparse", "[StateVectorCudaManaged_Expval]", float, double) { using StateVectorT = StateVectorCudaManaged; using ComplexT = StateVectorT::ComplexT; + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; SECTION("Sparse expval") { std::vector init_state{{0.0, 0.0}, {0.0, 0.1}, {0.1, 0.1}, @@ -344,17 +346,18 @@ 
TEMPLATE_TEST_CASE("StateVectorCudaManaged::Hamiltonian_expval_Sparse", StateVectorT sv{init_state.data(), init_state.size()}; auto m = Measurements(sv); - std::vector index_ptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; - std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, - 4, 7, 5, 6, 5, 6, 4, 7}; + std::vector index_ptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; std::vector values = { {3.1415, 0.0}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {3.1415, 0.0}, {3.1415, 0.0}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {3.1415, 0.0}}; - auto result = m.expval(index_ptr.data(), index_ptr.size(), - indices.data(), values.data(), values.size()); + auto result = m.expval( + index_ptr.data(), static_cast(index_ptr.size()), + indices.data(), values.data(), static_cast(values.size())); auto expected = TestType(3.1415); CHECK(expected == Approx(result).epsilon(1e-7)); } @@ -372,22 +375,23 @@ TEMPLATE_TEST_CASE("StateVectorCudaManaged::Hamiltonian_expval_Sparse", // measurements. 
Measurements Measurer(sv); const size_t num_qubits = 3; - const size_t data_size = Pennylane::Util::exp2(num_qubits); + size_t data_size = Pennylane::Util::exp2(num_qubits); - std::vector row_map; - std::vector entries; + std::vector row_map; + std::vector entries; std::vector values; - write_CSR_vectors(row_map, entries, values, data_size); + write_CSR_vectors(row_map, entries, values, + static_cast(data_size)); - PrecisionT exp_values = - Measurer.expval(row_map.data(), row_map.size(), entries.data(), - values.data(), values.size()); + PrecisionT exp_values = Measurer.expval( + row_map.data(), static_cast(row_map.size()), + entries.data(), values.data(), static_cast(values.size())); PrecisionT exp_values_ref = 0.5930885; REQUIRE(exp_values == Approx(exp_values_ref).margin(1e-6)); - PrecisionT var_values = - Measurer.var(row_map.data(), row_map.size(), entries.data(), - values.data(), values.size()); + PrecisionT var_values = Measurer.var( + row_map.data(), static_cast(row_map.size()), + entries.data(), values.data(), static_cast(values.size())); PrecisionT var_values_ref = 2.4624654; REQUIRE(var_values == Approx(var_values_ref).margin(1e-6)); } diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp index 47505e73fe..b6fdab8737 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/tests/mpi/Test_StateVectorCudaMPI_Expval.cpp @@ -401,6 +401,8 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_Sparse", "[StateVectorCudaMPI_Expval]", double) { using StateVectorT = StateVectorCudaMPI; using ComplexT = StateVectorT::ComplexT; + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; MPIManager 
mpi_manager(MPI_COMM_WORLD); REQUIRE(mpi_manager.getSize() == 2); @@ -431,17 +433,18 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_Sparse", sv.CopyHostDataToGpu(local_init_sv.data(), local_init_sv.size(), false); auto m = MeasurementsMPI(sv); - std::vector index_ptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; - std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, - 4, 7, 5, 6, 5, 6, 4, 7}; + std::vector index_ptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; std::vector values = { {3.1415, 0.0}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {3.1415, 0.0}, {3.1415, 0.0}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {0.0, -3.1415}, {3.1415, 0.0}, {0.0, 3.1415}, {3.1415, 0.0}}; - auto result = m.expval(index_ptr.data(), index_ptr.size(), - indices.data(), values.data(), values.size()); + auto result = m.expval( + index_ptr.data(), static_cast(index_ptr.size()), + indices.data(), values.data(), static_cast(values.size())); auto expected = TestType(3.1415); CHECK(expected == Approx(result).epsilon(1e-7)); } @@ -464,24 +467,25 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Hamiltonian_expval_Sparse", // This object attaches to the statevector allowing several // measurements. 
MeasurementsMPI Measurer(sv); - const size_t data_size = Pennylane::Util::exp2(num_qubits); + size_t data_size = Pennylane::Util::exp2(num_qubits); - std::vector row_map; - std::vector entries; + std::vector row_map; + std::vector entries; std::vector values; - write_CSR_vectors(row_map, entries, values, data_size); + write_CSR_vectors(row_map, entries, values, + static_cast(data_size)); - PrecisionT exp_values = - Measurer.expval(row_map.data(), row_map.size(), entries.data(), - values.data(), values.size()); + PrecisionT exp_values = Measurer.expval( + row_map.data(), static_cast(row_map.size()), + entries.data(), values.data(), static_cast(values.size())); PrecisionT exp_values_ref = 0.5930885; REQUIRE(exp_values == Approx(exp_values_ref).margin(1e-6)); mpi_manager.Barrier(); - PrecisionT var_values = - Measurer.var(row_map.data(), row_map.size(), entries.data(), - values.data(), values.size()); + PrecisionT var_values = Measurer.var( + row_map.data(), static_cast(row_map.size()), + entries.data(), values.data(), static_cast(values.size())); PrecisionT var_values_ref = 2.4624654; REQUIRE(var_values == Approx(var_values_ref).margin(1e-6)); } diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.cpp index b186caa39e..c4f2ca82d4 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.cpp @@ -28,3 +28,6 @@ template class Observables::TensorProdObs>; template class Observables::Hamiltonian>; template class Observables::Hamiltonian>; + +template class Observables::SparseHamiltonian>; +template class Observables::SparseHamiltonian>; diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.hpp 
index c92e7dac5d..6d0d0a94e7 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPU.hpp @@ -209,4 +209,87 @@ class Hamiltonian final : public HamiltonianBase { } }; +/** + * @brief Sparse representation of Hamiltonian + * + */ +template +class SparseHamiltonian final : public SparseHamiltonianBase { + private: + using BaseType = SparseHamiltonianBase; + + public: + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + // cuSparse required index type + using IdxT = typename BaseType::IdxT; + + /** + * @brief Create a SparseHamiltonian from data, indices and offsets in CSR + * format. + * + * @param data Arguments to construct data + * @param indices Arguments to construct indices + * @param offsets Arguments to construct offsets + * @param wires Arguments to construct wires + */ + template + explicit SparseHamiltonian(T1 &&data, T2 &&indices, T3 &&offsets, + T4 &&wires) + : BaseType{data, indices, offsets, wires} {} + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param data Argument to construct data + * @param indices Argument to construct indices + * @param offsets Argument to construct ofsets + * @param wires Argument to construct wires + */ + static auto create(std::initializer_list data, + std::initializer_list indices, + std::initializer_list offsets, + std::initializer_list wires) + -> std::shared_ptr> { + return std::shared_ptr>( + new SparseHamiltonian{ + std::move(data), std::move(indices), std::move(offsets), + std::move(wires)}); + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. 
+ * + */ + void applyInPlace(StateVectorT &sv) const override { + PL_ABORT_IF_NOT(this->wires_.size() == sv.getNumQubits(), + "SparseH wire count does not match state-vector size"); + using CFP_t = typename StateVectorT::CFP_t; + + const std::size_t nIndexBits = sv.getNumQubits(); + const std::size_t length = std::size_t{1} << nIndexBits; + + auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); + + cusparseHandle_t handle = sv.getCusparseHandle(); + + std::unique_ptr> d_sv_prime = + std::make_unique>(length, device_id, stream_id, + true); + + SparseMV_cuSparse( + this->offsets_.data(), static_cast(this->offsets_.size()), + this->indices_.data(), this->data_.data(), + static_cast(this->data_.size()), sv.getData(), + d_sv_prime->getData(), device_id, stream_id, handle); + sv.updateData(std::move(d_sv_prime)); + } +}; + } // namespace Pennylane::LightningGPU::Observables diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.cpp index 9b1f776e0f..ae9ac9100a 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.cpp @@ -28,3 +28,6 @@ template class Observables::TensorProdObsMPI>; template class Observables::HamiltonianMPI>; template class Observables::HamiltonianMPI>; + +template class Observables::SparseHamiltonianMPI>; +template class Observables::SparseHamiltonianMPI>; diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.hpp index d15df18207..94f5e45739 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.hpp +++ 
b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/ObservablesGPUMPI.hpp @@ -21,6 +21,7 @@ #include "Constant.hpp" #include "ConstantUtil.hpp" // lookup #include "LinearAlg.hpp" +#include "MPILinearAlg.hpp" #include "Observables.hpp" #include "StateVectorCudaMPI.hpp" #include "Util.hpp" @@ -213,4 +214,97 @@ class HamiltonianMPI final : public HamiltonianBase { } }; +/** + * @brief Sparse representation of Hamiltonian + * + */ +template +class SparseHamiltonianMPI final : public SparseHamiltonianBase { + public: + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + // cuSparse required index type + using IdxT = + typename std::conditional::value, + int32_t, int64_t>::type; + + private: + using BaseType = SparseHamiltonianBase; + + public: + /** + * @brief Create a SparseHamiltonianMPI from data, indices and offsets in + * CSR format. + * + * @param data Arguments to construct data + * @param indices Arguments to construct indices + * @param offsets Arguments to construct offsets + * @param wires Arguments to construct wires + */ + template + explicit SparseHamiltonianMPI(T1 &&data, T2 &&indices, T3 &&offsets, + T4 &&wires) + : BaseType{data, indices, offsets, wires} {} + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. 
+ * + * @param data Argument to construct data + * @param indices Argument to construct indices + * @param offsets Argument to construct ofsets + * @param wires Argument to construct wires + */ + static auto create(std::initializer_list data, + std::initializer_list indices, + std::initializer_list offsets, + std::initializer_list wires) + -> std::shared_ptr> { + return std::shared_ptr>( + new SparseHamiltonianMPI{ + std::move(data), std::move(indices), std::move(offsets), + std::move(wires)}); + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. + * + */ + void applyInPlace(StateVectorT &sv) const override { + auto mpi_manager = sv.getMPIManager(); + if (mpi_manager.getRank() == 0) { + PL_ABORT_IF_NOT( + this->wires_.size() == sv.getTotalNumQubits(), + "SparseH wire count does not match state-vector size"); + } + using CFP_t = typename StateVectorT::CFP_t; + + auto device_id = sv.getDataBuffer().getDevTag().getDeviceID(); + auto stream_id = sv.getDataBuffer().getDevTag().getStreamID(); + + const size_t length_local = size_t{1} << sv.getNumLocalQubits(); + + std::unique_ptr> d_sv_prime = + std::make_unique>(length_local, device_id, + stream_id, true); + d_sv_prime->zeroInit(); + PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); + mpi_manager.Barrier(); + + cuUtil::SparseMV_cuSparseMPI( + mpi_manager, length_local, this->offsets_.data(), + static_cast(this->offsets_.size()), this->indices_.data(), + this->data_.data(), const_cast(sv.getData()), + d_sv_prime->getData(), device_id, stream_id, + sv.getCusparseHandle()); + + sv.CopyGpuDataToGpuIn(d_sv_prime->getData(), d_sv_prime->getLength()); + mpi_manager.Barrier(); + } +}; + } // namespace Pennylane::LightningGPU::Observables diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/Test_ObservablesGPU.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/Test_ObservablesGPU.cpp index 0d8bd7d388..398f664ffc 
100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/Test_ObservablesGPU.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/Test_ObservablesGPU.cpp @@ -154,6 +154,20 @@ TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian", "[Observables]", } } +TEMPLATE_PRODUCT_TEST_CASE("SparseHamiltonian", "[Observables]", + (StateVectorCudaManaged), (float, double)) { + using StateVectorT = TestType; + using SparseHamiltonianT = SparseHamiltonian; + + SECTION("Copy constructibility") { + REQUIRE(std::is_copy_constructible_v); + } + + SECTION("Move constructibility") { + REQUIRE(std::is_move_constructible_v); + } +} + TEMPLATE_PRODUCT_TEST_CASE("Observables::HermitianHasher", "[Observables]", (StateVectorCudaManaged), (float, double)) { using StateVectorT = TestType; @@ -257,3 +271,31 @@ TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian::ApplyInPlace", "[Observables]", } } } + +TEMPLATE_PRODUCT_TEST_CASE("SparseHamiltonian::ApplyInPlace", "[Observables]", + (StateVectorCudaManaged), (float, double)) { + using StateVectorT = TestType; + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + + const std::size_t num_qubits = 3; + std::mt19937 re{1337}; + + auto sparseH = SparseHamiltonian::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, {0, 1, 2}); + + auto init_state = createRandomStateVectorData(re, num_qubits); + + StateVectorT state_vector(init_state.data(), init_state.size()); + + sparseH->applyInPlace(state_vector); + + std::reverse(init_state.begin(), init_state.end()); + + REQUIRE(isApproxEqual(state_vector.getDataVector().data(), + state_vector.getDataVector().size(), + init_state.data(), init_state.size())); +} \ No newline at end of file diff --git 
a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/mpi/Test_ObservablesGPUMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/mpi/Test_ObservablesGPUMPI.cpp index 6abde861e6..bc4d0e517f 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/mpi/Test_ObservablesGPUMPI.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/observables/tests/mpi/Test_ObservablesGPUMPI.cpp @@ -289,4 +289,58 @@ TEMPLATE_PRODUCT_TEST_CASE("Observables::HermitianHasherMPI", "[Observables]", CHECK(ham_1->getObsName() == res1.str()); CHECK(ham_2->getObsName() == res2.str()); } +} + +TEMPLATE_PRODUCT_TEST_CASE("SparseHamiltonian::ApplyInPlace", "[Observables]", + (StateVectorCudaMPI), (float, double)) { + using StateVectorT = TestType; + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + MPIManager mpi_manager(MPI_COMM_WORLD); + + const std::size_t num_qubits = 3; + std::mt19937 re{1337}; + + auto sparseH = SparseHamiltonianMPI::create( + {ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}, + ComplexT{1.0, 0.0}, ComplexT{1.0, 0.0}}, + {7, 6, 5, 4, 3, 2, 1, 0}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, {0, 1, 2}); + + size_t mpi_buffersize = 1; + size_t nGlobalIndexBits = + std::bit_width(static_cast(mpi_manager.getSize())) - 1; + size_t nLocalIndexBits = num_qubits - nGlobalIndexBits; + size_t subSvLength = 1 << nLocalIndexBits; + + mpi_manager.Barrier(); + std::vector expected_sv(subSvLength); + std::vector local_state(subSvLength); + + auto init_state = createRandomStateVectorData(re, num_qubits); + + mpi_manager.Scatter(init_state.data(), local_state.data(), subSvLength, 0); + mpi_manager.Barrier(); + + int nDevices = 0; + cudaGetDeviceCount(&nDevices); + int deviceId = mpi_manager.getRank() % nDevices; + cudaSetDevice(deviceId); + DevTag dt_local(deviceId, 0); + mpi_manager.Barrier(); 
+ + StateVectorT sv_mpi(mpi_manager, dt_local, mpi_buffersize, nGlobalIndexBits, + nLocalIndexBits); + + sv_mpi.CopyHostDataToGpu(local_state, false); + + sparseH->applyInPlace(sv_mpi); + + std::reverse(init_state.begin(), init_state.end()); + mpi_manager.Scatter(init_state.data(), expected_sv.data(), subSvLength, 0); + mpi_manager.Barrier(); + + REQUIRE(isApproxEqual(sv_mpi.getDataVector().data(), + sv_mpi.getDataVector().size(), expected_sv.data(), + expected_sv.size())); } \ No newline at end of file diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/LinearAlg.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/LinearAlg.hpp index f70d4ea9f2..13a7ec9a90 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/LinearAlg.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/LinearAlg.hpp @@ -307,20 +307,19 @@ inline SharedCusparseHandle make_shared_cusparse_handle() { * @param handle cuSparse handle. */ template -inline void SparseMV_cuSparse(const index_type *csrOffsets_ptr, - const index_type csrOffsets_size, - const index_type *columns_ptr, - const std::complex *values_ptr, - const index_type numNNZ, CFP_t *X, CFP_t *Y, - DevTypeID device_id, cudaStream_t stream_id, - cusparseHandle_t handle) { - const int64_t num_rows = static_cast( +inline void +SparseMV_cuSparse(const index_type *csrOffsets_ptr, + const int64_t csrOffsets_size, const index_type *columns_ptr, + const std::complex *values_ptr, + const int64_t numNNZ, CFP_t *X, CFP_t *Y, DevTypeID device_id, + cudaStream_t stream_id, cusparseHandle_t handle) { + const int64_t num_rows = csrOffsets_size - - 1); // int64_t is required for num_rows by cusparseCreateCsr - const int64_t num_cols = static_cast( - num_rows); // int64_t is required for num_cols by cusparseCreateCsr - const int64_t nnz = static_cast( - numNNZ); // int64_t is required for nnz by cusparseCreateCsr + 1; // int64_t is required for num_rows by cusparseCreateCsr + const 
int64_t num_cols = + num_rows; // int64_t is required for num_cols by cusparseCreateCsr + const int64_t nnz = + numNNZ; // int64_t is required for nnz by cusparseCreateCsr const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; @@ -338,13 +337,15 @@ inline void SparseMV_cuSparse(const index_type *csrOffsets_ptr, d_values.CopyHostDataToGpu(values_ptr, d_values.getLength(), false); cudaDataType_t data_type; - cusparseIndexType_t compute_type = CUSPARSE_INDEX_64I; + cusparseIndexType_t compute_type; if constexpr (std::is_same_v || std::is_same_v) { data_type = CUDA_C_64F; + compute_type = CUSPARSE_INDEX_64I; } else { data_type = CUDA_C_32F; + compute_type = CUSPARSE_INDEX_32I; } // CUSPARSE APIs @@ -394,8 +395,7 @@ inline void SparseMV_cuSparse(const index_type *csrOffsets_ptr, /* cusparseSpMVAlg_t */ CUSPARSE_SPMV_ALG_DEFAULT, /* size_t* */ &bufferSize)); - DataBuffer dBuffer{bufferSize, device_id, stream_id, - true}; + DataBuffer dBuffer{bufferSize, device_id, stream_id, true}; // execute SpMV PL_CUSPARSE_IS_SUCCESS(cusparseSpMV( @@ -439,19 +439,19 @@ inline void SparseMV_cuSparse(const index_type *csrOffsets_ptr, */ template inline void SparseMV_cuSparse(const index_type *csrOffsets_ptr, - const index_type csrOffsets_size, + const int64_t csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, - const index_type numNNZ, const CFP_t *X, CFP_t *Y, + const int64_t numNNZ, const CFP_t *X, CFP_t *Y, DevTypeID device_id, cudaStream_t stream_id, cusparseHandle_t handle) { - const int64_t num_rows = static_cast( + const int64_t num_rows = csrOffsets_size - - 1); // int64_t is required for num_rows by cusparseCreateCsr - const int64_t num_cols = static_cast( - num_rows); // int64_t is required for num_cols by cusparseCreateCsr - const int64_t nnz = static_cast( - numNNZ); // int64_t is required for nnz by cusparseCreateCsr + 1; // int64_t is required for num_rows by cusparseCreateCsr + const int64_t num_cols = + num_rows; // int64_t is 
required for num_cols by cusparseCreateCsr + const int64_t nnz = + numNNZ; // int64_t is required for nnz by cusparseCreateCsr const CFP_t alpha = {1.0, 0.0}; const CFP_t beta = {0.0, 0.0}; diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/MPILinearAlg.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/MPILinearAlg.hpp index 4b2e905b8e..cd2afd426b 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/MPILinearAlg.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/MPILinearAlg.hpp @@ -45,15 +45,15 @@ namespace Pennylane::LightningGPU::Util { template inline void SparseMV_cuSparseMPI( MPIManager &mpi_manager, const size_t &length_local, - const index_type *csrOffsets_ptr, const index_type csrOffsets_size, + const index_type *csrOffsets_ptr, const int64_t csrOffsets_size, const index_type *columns_ptr, const std::complex *values_ptr, CFP_t *X, CFP_t *Y, DevTypeID device_id, cudaStream_t stream_id, cusparseHandle_t handle) { std::vector>> csrmatrix_blocks; if (mpi_manager.getRank() == 0) { csrmatrix_blocks = splitCSRMatrix( - mpi_manager, csrOffsets_size - 1, csrOffsets_ptr, columns_ptr, - values_ptr); + mpi_manager, static_cast(csrOffsets_size - 1), + csrOffsets_ptr, columns_ptr, values_ptr); } mpi_manager.Barrier(); @@ -79,11 +79,11 @@ inline void SparseMV_cuSparseMPI( color = 1; SparseMV_cuSparse( localCSRMatrix.getCsrOffsets().data(), - localCSRMatrix.getCsrOffsets().size(), + static_cast(localCSRMatrix.getCsrOffsets().size()), localCSRMatrix.getColumns().data(), localCSRMatrix.getValues().data(), - localCSRMatrix.getValues().size(), X, d_res_per_block.getData(), - device_id, stream_id, handle); + static_cast(localCSRMatrix.getValues().size()), X, + d_res_per_block.getData(), device_id, stream_id, handle); } PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize()); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/Test_LinearAlgebra.cpp 
b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/Test_LinearAlgebra.cpp index cd1dd3937a..a2b35d0742 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/Test_LinearAlgebra.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/Test_LinearAlgebra.cpp @@ -40,6 +40,8 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, using StateVectorT = StateVectorCudaManaged; using ComplexT = StateVectorT::ComplexT; using CFP_t = StateVectorT::CFP_t; + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; std::size_t num_qubits = 3; std::size_t data_size = exp2(num_qubits); @@ -52,9 +54,9 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, {0.2, -0.1}, {-0.1, 0.2}, {0.2, 0.1}, {0.1, 0.2}, {0.7, -0.2}, {-0.1, 0.6}, {0.6, 0.1}, {0.2, 0.7}}; - std::vector indptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; - std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, - 4, 7, 5, 6, 5, 6, 4, 7}; + std::vector indptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; std::vector values = { {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}, @@ -69,10 +71,10 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, SECTION("Testing sparse matrix vector product:") { std::vector result(data_size); - cuUtil::SparseMV_cuSparse( - indptr.data(), indptr.size(), indices.data(), values.data(), - values.size(), sv_x.getData(), sv_y.getData(), - sv_x.getDataBuffer().getDevTag().getDeviceID(), + cuUtil::SparseMV_cuSparse( + indptr.data(), static_cast(indptr.size()), indices.data(), + values.data(), static_cast(values.size()), sv_x.getData(), + sv_y.getData(), sv_x.getDataBuffer().getDevTag().getDeviceID(), sv_x.getDataBuffer().getDevTag().getStreamID(), sv_x.getCusparseHandle()); diff --git 
a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/mpi/Test_LinearAlgebraMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/mpi/Test_LinearAlgebraMPI.cpp index 64df9d77e8..371bc66f75 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/mpi/Test_LinearAlgebraMPI.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/tests/mpi/Test_LinearAlgebraMPI.cpp @@ -40,6 +40,8 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, using StateVectorT = StateVectorCudaMPI; using ComplexT = StateVectorT::ComplexT; using CFP_t = StateVectorT::CFP_t; + using IdxT = typename std::conditional::value, + int32_t, int64_t>::type; MPIManager mpi_manager(MPI_COMM_WORLD); REQUIRE(mpi_manager.getSize() == 2); @@ -54,9 +56,9 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, {0.1, 0.2}, {0.7, -0.2}, {-0.1, 0.6}, {0.6, 0.1}, {0.2, 0.7}}; - std::vector indptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; - std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, - 4, 7, 5, 6, 5, 6, 4, 7}; + std::vector indptr = {0, 2, 4, 6, 8, 10, 12, 14, 16}; + std::vector indices = {0, 3, 1, 2, 1, 2, 0, 3, + 4, 7, 5, 6, 5, 6, 4, 7}; std::vector values = { {1.0, 0.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {0.0, -1.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 0.0}, @@ -95,9 +97,10 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float, nGlobalIndexBits, nLocalIndexBits); sv_x.CopyHostDataToGpu(local_state, false); - cuUtil::SparseMV_cuSparseMPI( - mpi_manager, sv_x.getLength(), indptr.data(), indptr.size(), - indices.data(), values.data(), sv_x.getData(), sv_y.getData(), + cuUtil::SparseMV_cuSparseMPI( + mpi_manager, sv_x.getLength(), indptr.data(), + static_cast(indptr.size()), indices.data(), values.data(), + sv_x.getData(), sv_y.getData(), sv_x.getDataBuffer().getDevTag().getDeviceID(), sv_x.getDataBuffer().getDevTag().getStreamID(), sv_x.getCusparseHandle()); diff --git 
a/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp index a3f0951b24..02064e811f 100644 --- a/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp @@ -17,6 +17,7 @@ */ #pragma once +#include #include #include #include @@ -170,7 +171,7 @@ class StateVectorKokkos final * * @param num_qubits Number of qubits */ - StateVectorKokkos(ComplexT *hostdata_, size_t length, + StateVectorKokkos(ComplexT *hostdata_, std::size_t length, const Kokkos::InitializationSettings &kokkos_args = {}) : StateVectorKokkos(log2(length), kokkos_args) { PL_ABORT_IF_NOT(isPerfectPowerOf2(length), @@ -178,12 +179,20 @@ class StateVectorKokkos final HostToDevice(hostdata_, length); } + StateVectorKokkos(std::complex *hostdata_, std::size_t length, + const Kokkos::InitializationSettings &kokkos_args = {}) + : StateVectorKokkos(log2(length), kokkos_args) { + PL_ABORT_IF_NOT(isPerfectPowerOf2(length), + "The size of provided data must be a power of 2."); + HostToDevice(reinterpret_cast(hostdata_), length); + } + /** * @brief Create a new state vector from data on the host. * * @param num_qubits Number of qubits */ - StateVectorKokkos(const ComplexT *hostdata_, size_t length, + StateVectorKokkos(const ComplexT *hostdata_, std::size_t length, const Kokkos::InitializationSettings &kokkos_args = {}) : StateVectorKokkos(log2(length), kokkos_args) { PL_ABORT_IF_NOT(isPerfectPowerOf2(length), @@ -692,7 +701,7 @@ class StateVectorKokkos final * @param new_data data pointer to new data. * @param new_size size of underlying data storage. 
*/ - void updateData(ComplexT *new_data, size_t new_size) { + void updateData(ComplexT *new_data, std::size_t new_size) { updateData(KokkosVector(new_data, new_size)); } @@ -744,7 +753,7 @@ class StateVectorKokkos final * @brief Copy data from the host space to the device space. * */ - inline void HostToDevice(ComplexT *sv, size_t length) { + inline void HostToDevice(ComplexT *sv, std::size_t length) { Kokkos::deep_copy(*data_, UnmanagedComplexHostView(sv, length)); } @@ -752,7 +761,7 @@ class StateVectorKokkos final * @brief Copy data from the device space to the host space. * */ - inline void DeviceToHost(ComplexT *sv, size_t length) const { + inline void DeviceToHost(ComplexT *sv, std::size_t length) const { Kokkos::deep_copy(UnmanagedComplexHostView(sv, length), *data_); } diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp index bd9d89d72f..6432864c4e 100644 --- a/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/bindings/LKokkosBindings.hpp @@ -24,6 +24,7 @@ #include "ConstantUtil.hpp" // lookup #include "GateOperation.hpp" #include "MeasurementsKokkos.hpp" +#include "ObservablesKokkos.hpp" #include "StateVectorKokkos.hpp" #include "TypeList.hpp" #include "Util.hpp" // exp2 @@ -33,6 +34,7 @@ namespace { using namespace Pennylane::Bindings; using namespace Pennylane::LightningKokkos::Algorithms; using namespace Pennylane::LightningKokkos::Measures; +using namespace Pennylane::LightningKokkos::Observables; using Kokkos::InitializationSettings; using Pennylane::LightningKokkos::StateVectorKokkos; using Pennylane::Util::exp2; @@ -214,6 +216,58 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) { "Variance of a sparse Hamiltonian."); } +/** + * @brief Register observable classes. 
+ * + * @tparam StateVectorT + * @param m Pybind module + */ +template +void registerBackendSpecificObservables(py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. + using ParamT = PrecisionT; // Parameter's data precision + + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + using np_arr_c = py::array_t, py::array::c_style>; + + std::string class_name; + + class_name = "SparseHamiltonianC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, + const std::vector &indices, + const std::vector &indptr, + const std::vector &wires) { + using ComplexT = typename StateVectorT::ComplexT; + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = static_cast(buffer_data.ptr); + + return SparseHamiltonian{ + std::vector({data_ptr, data_ptr + data.size()}), + indices, indptr, wires}; + })) + .def("__repr__", &SparseHamiltonian::getObsName) + .def("get_wires", &SparseHamiltonian::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonian &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); +} + /** * @brief Register backend specific adjoint Jacobian methods. 
* diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.cpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.cpp index d90f3e6019..66192b934a 100644 --- a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.cpp @@ -28,3 +28,6 @@ template class Observables::TensorProdObs>; template class Observables::Hamiltonian>; template class Observables::Hamiltonian>; + +template class Observables::SparseHamiltonian>; +template class Observables::SparseHamiltonian>; diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp index a0371df7f2..c3fae6b3ea 100644 --- a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp @@ -31,6 +31,7 @@ namespace { using namespace Pennylane::Util; using namespace Pennylane::Observables; using Pennylane::LightningKokkos::StateVectorKokkos; +using Pennylane::LightningKokkos::Util::SparseMV_Kokkos; } // namespace /// @endcond @@ -199,6 +200,76 @@ class Hamiltonian final : public HamiltonianBase { } }; +/** + * @brief Sparse representation of Hamiltonian + * + */ +template +class SparseHamiltonian final : public SparseHamiltonianBase { + private: + using BaseType = SparseHamiltonianBase; + + public: + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + using IdxT = typename BaseType::IdxT; + + /** + * @brief Create a SparseHamiltonian from data, indices and offsets in CSR + * format. 
+ * + * @param data Arguments to construct data + * @param indices Arguments to construct indices + * @param offsets Arguments to construct offsets + * @param wires Arguments to construct wires + */ + template + explicit SparseHamiltonian(T1 &&data, T2 &&indices, T3 &&offsets, + T4 &&wires) + : BaseType{data, indices, offsets, wires} {} + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param data Argument to construct data + * @param indices Argument to construct indices + * @param offsets Argument to construct ofsets + * @param wires Argument to construct wires + */ + static auto create(std::initializer_list data, + std::initializer_list indices, + std::initializer_list offsets, + std::initializer_list wires) + -> std::shared_ptr> { + return std::shared_ptr>( + new SparseHamiltonian{ + std::move(data), std::move(indices), std::move(offsets), + std::move(wires)}); + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. 
+ * + */ + void applyInPlace(StateVectorT &sv) const override { + PL_ABORT_IF_NOT(this->wires_.size() == sv.getNumQubits(), + "SparseH wire count does not match state-vector size"); + StateVectorT d_sv_prime(sv.getNumQubits()); + + SparseMV_Kokkos( + sv.getView(), d_sv_prime.getView(), this->offsets_.data(), + this->offsets_.size(), this->indices_.data(), this->data_.data(), + this->data_.size()); + + sv.updateData(d_sv_prime); + } +}; + /// @cond DEV namespace detail { using Pennylane::LightningKokkos::Util::axpy_Kokkos; diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/tests/Test_ObservablesKokkos.cpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/tests/Test_ObservablesKokkos.cpp index aa616d2e33..5d402b1e2d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/tests/Test_ObservablesKokkos.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_kokkos/observables/tests/Test_ObservablesKokkos.cpp @@ -153,6 +153,20 @@ TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian", "[Observables]", (StateVectorKokkos), } } +TEMPLATE_PRODUCT_TEST_CASE("SparseHamiltonian", "[Observables]", + (StateVectorKokkos), (float, double)) { + using StateVectorT = TestType; + using SparseHamiltonianT = SparseHamiltonian; + + SECTION("Copy constructibility") { + REQUIRE(std::is_copy_constructible_v); + } + + SECTION("Move constructibility") { + REQUIRE(std::is_move_constructible_v); + } +} + TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian::ApplyInPlace", "[Observables]", (StateVectorKokkos), (float, double)) { using StateVectorT = TestType; diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp index 190a9c6525..2f83f2f39d 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp +++ 
b/pennylane_lightning/core/src/simulators/lightning_qubit/bindings/LQubitBindings.hpp @@ -25,15 +25,17 @@ #include "DynamicDispatcher.hpp" #include "GateOperation.hpp" #include "MeasurementsLQubit.hpp" +#include "ObservablesLQubit.hpp" #include "StateVectorLQubitRaw.hpp" #include "TypeList.hpp" #include "VectorJacobianProduct.hpp" /// @cond DEV namespace { -using namespace Pennylane::LightningQubit::Measures; -using namespace Pennylane::LightningQubit::Algorithms; using namespace Pennylane::Bindings; +using namespace Pennylane::LightningQubit::Algorithms; +using namespace Pennylane::LightningQubit::Measures; +using namespace Pennylane::LightningQubit::Observables; using Pennylane::LightningQubit::StateVectorLQubitRaw; } // namespace /// @endcond @@ -180,6 +182,58 @@ void registerBackendSpecificMeasurements(PyClass &pyclass) { }); } +/** + * @brief Register backend specific observables. + * + * @tparam StateVectorT + * @param m Pybind module + */ +template +void registerBackendSpecificObservables([[maybe_unused]] py::module_ &m) { + using PrecisionT = + typename StateVectorT::PrecisionT; // Statevector's precision. 
+ using ParamT = PrecisionT; // Parameter's data precision + + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + using np_arr_c = py::array_t, py::array::c_style>; + + std::string class_name; + + class_name = "SparseHamiltonianC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &data, + const std::vector &indices, + const std::vector &indptr, + const std::vector &wires) { + using ComplexT = typename StateVectorT::ComplexT; + const py::buffer_info buffer_data = data.request(); + const auto *data_ptr = static_cast(buffer_data.ptr); + + return SparseHamiltonian{ + std::vector({data_ptr, data_ptr + data.size()}), + indices, indptr, wires}; + })) + .def("__repr__", &SparseHamiltonian::getObsName) + .def("get_wires", &SparseHamiltonian::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const SparseHamiltonian &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); +} + /** * @brief Register Vector Jacobian Product. 
*/ diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.cpp index 0a1fb54c48..e45e2c1572 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.cpp @@ -41,3 +41,9 @@ template class Observables::Hamiltonian>; template class Observables::Hamiltonian>; template class Observables::Hamiltonian>; + +template class Observables::SparseHamiltonian>; +template class Observables::SparseHamiltonian>; + +template class Observables::SparseHamiltonian>; +template class Observables::SparseHamiltonian>; diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.hpp index 3433a3fcc3..b659969037 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/ObservablesLQubit.hpp @@ -27,6 +27,7 @@ #include "LinearAlgebra.hpp" // scaleAndAdd #include "Macros.hpp" // use_openmp #include "Observables.hpp" +#include "SparseLinAlg.hpp" #include "StateVectorLQubitManaged.hpp" #include "StateVectorLQubitRaw.hpp" #include "Util.hpp" @@ -359,4 +360,74 @@ class Hamiltonian final : public HamiltonianBase { } }; +/** + * @brief Sparse representation of Hamiltonian + * + */ +template +class SparseHamiltonian final : public SparseHamiltonianBase { + private: + using BaseType = SparseHamiltonianBase; + + public: + using PrecisionT = typename StateVectorT::PrecisionT; + using ComplexT = typename StateVectorT::ComplexT; + using IdxT = typename BaseType::IdxT; + + /** + * @brief Create a SparseHamiltonian from data, indices and offsets in CSR + * format. 
+ * + * @param data Arguments to construct data + * @param indices Arguments to construct indices + * @param offsets Arguments to construct offsets + * @param wires Arguments to construct wires + */ + template + explicit SparseHamiltonian(T1 &&data, T2 &&indices, T3 &&offsets, + T4 &&wires) + : BaseType{data, indices, offsets, wires} {} + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param data Argument to construct data + * @param indices Argument to construct indices + * @param offsets Argument to construct offsets + * @param wires Argument to construct wires + */ + static auto create(std::initializer_list data, + std::initializer_list indices, + std::initializer_list offsets, + std::initializer_list wires) + -> std::shared_ptr> { + // NOLINTBEGIN(*-move-const-arg) + return std::shared_ptr>( + new SparseHamiltonian{ + std::move(data), std::move(indices), std::move(offsets), + std::move(wires)}); + // NOLINTEND(*-move-const-arg) + } + + /** + * @brief Updates the statevector SV:->SV', where SV' = a*H*SV, and where H + * is a sparse Hamiltonian. 
+ * + */ + void applyInPlace(StateVectorT &sv) const override { + PL_ABORT_IF_NOT(this->wires_.size() == sv.getNumQubits(), + "SparseH wire count does not match state-vector size"); + auto operator_vector = Util::apply_Sparse_Matrix( + sv.getData(), sv.getLength(), this->offsets_.data(), + this->offsets_.size(), this->indices_.data(), this->data_.data(), + this->data_.size()); + + sv.updateData(operator_vector); + } +}; + } // namespace Pennylane::LightningQubit::Observables \ No newline at end of file diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/tests/Test_ObservablesLQubit.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/tests/Test_ObservablesLQubit.cpp index 624248da20..4ef59b04c5 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/observables/tests/Test_ObservablesLQubit.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/observables/tests/Test_ObservablesLQubit.cpp @@ -159,6 +159,21 @@ TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian", "[Observables]", } } +TEMPLATE_PRODUCT_TEST_CASE("SparseHamiltonian", "[Observables]", + (StateVectorLQubitManaged, StateVectorLQubitRaw), + (float, double)) { + using StateVectorT = TestType; + using SparseHamiltonianT = SparseHamiltonian; + + SECTION("Copy constructibility") { + REQUIRE(std::is_copy_constructible_v); + } + + SECTION("Move constructibility") { + REQUIRE(std::is_move_constructible_v); + } +} + TEMPLATE_PRODUCT_TEST_CASE("Hamiltonian::ApplyInPlace", "[Observables]", (StateVectorLQubitManaged, StateVectorLQubitRaw), (float, double)) { diff --git a/pennylane_lightning/core/src/utils/TestHelpers.hpp b/pennylane_lightning/core/src/utils/TestHelpers.hpp index 605e584bcb..af1a46618a 100644 --- a/pennylane_lightning/core/src/utils/TestHelpers.hpp +++ b/pennylane_lightning/core/src/utils/TestHelpers.hpp @@ -429,7 +429,8 @@ void write_CSR_vectors(std::vector &row_map, const ComplexT SC_ONE = 1.0; row_map.resize(numRows + 1); - for (IndexT 
rowIdx = 1; rowIdx < (IndexT)row_map.size(); ++rowIdx) { + for (IndexT rowIdx = 1; rowIdx < static_cast(row_map.size()); + ++rowIdx) { row_map[rowIdx] = row_map[rowIdx - 1] + 3; }; const IndexT numNNZ = row_map[numRows]; @@ -437,6 +438,7 @@ void write_CSR_vectors(std::vector &row_map, entries.resize(numNNZ); values.resize(numNNZ); for (IndexT rowIdx = 0; rowIdx < numRows; ++rowIdx) { + size_t idx = row_map[rowIdx]; if (rowIdx == 0) { entries[0] = rowIdx; entries[1] = rowIdx + 1; @@ -446,21 +448,21 @@ void write_CSR_vectors(std::vector &row_map, values[1] = -SC_ONE; values[2] = -SC_ONE; } else if (rowIdx == numRows - 1) { - entries[row_map[rowIdx]] = 0; - entries[row_map[rowIdx] + 1] = rowIdx - 1; - entries[row_map[rowIdx] + 2] = rowIdx; + entries[idx] = 0; + entries[idx + 1] = rowIdx - 1; + entries[idx + 2] = rowIdx; - values[row_map[rowIdx]] = -SC_ONE; - values[row_map[rowIdx] + 1] = -SC_ONE; - values[row_map[rowIdx] + 2] = SC_ONE; + values[idx] = -SC_ONE; + values[idx + 1] = -SC_ONE; + values[idx + 2] = SC_ONE; } else { - entries[row_map[rowIdx]] = rowIdx - 1; - entries[row_map[rowIdx] + 1] = rowIdx; - entries[row_map[rowIdx] + 2] = rowIdx + 1; + entries[idx] = rowIdx - 1; + entries[idx + 1] = rowIdx; + entries[idx + 2] = rowIdx + 1; - values[row_map[rowIdx]] = -SC_ONE; - values[row_map[rowIdx] + 1] = SC_ONE; - values[row_map[rowIdx] + 2] = -SC_ONE; + values[idx] = -SC_ONE; + values[idx + 1] = SC_ONE; + values[idx + 2] = -SC_ONE; } } }; diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index e5b85eea2f..98de0e9512 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -40,6 +40,21 @@ DevPool, ) + try: + # pylint: disable=no-name-in-module + from pennylane_lightning.lightning_gpu_ops import ( + StateVectorMPIC128, + StateVectorMPIC64, + MeasurementsMPIC128, + MeasurementsMPIC64, + MPIManager, + DevTag, + ) + + MPI_SUPPORT = True + 
except ImportError: + MPI_SUPPORT = False + from ctypes.util import find_library from importlib import util as imp_util @@ -91,11 +106,29 @@ create_ops_listC128, ) - def _gpu_dtype(dtype): + if MPI_SUPPORT: + from pennylane_lightning.lightning_gpu_ops.algorithmsMPI import ( + AdjointJacobianMPIC64, + create_ops_listMPIC64, + AdjointJacobianMPIC128, + create_ops_listMPIC128, + ) + + def _gpu_dtype(dtype, mpi=False): if dtype not in [np.complex128, np.complex64]: # pragma: no cover raise ValueError(f"Data type is not supported for state-vector computation: {dtype}") + if mpi: + return StateVectorMPIC128 if dtype == np.complex128 else StateVectorMPIC64 return StateVectorC128 if dtype == np.complex128 else StateVectorC64 + def _adj_dtype(use_csingle, mpi=False): + if mpi: + return AdjointJacobianMPIC64 if use_csingle else AdjointJacobianMPIC128 + return AdjointJacobianC64 if use_csingle else AdjointJacobianC128 + + def _mebibytesToBytes(mebibytes): + return mebibytes * 1024 * 1024 + allowed_operations = { "Identity", "BasisState", @@ -170,7 +203,7 @@ def _gpu_dtype(dtype): "SProd", } - class LightningGPU(LightningBase): + class LightningGPU(LightningBase): # pylint: disable=too-many-instance-attributes """PennyLane-Lightning-GPU device. 
Args: wires (int): the number of wires to initialize the device with @@ -194,11 +227,13 @@ def __init__( self, wires, *, + mpi: bool = False, + mpi_buf_size: int = 0, sync=False, c_dtype=np.complex128, shots=None, batch_obs: Union[bool, int] = False, - ): # pylint: disable=unused-argument + ): # pylint: disable=too-many-arguments if c_dtype is np.complex64: self.use_csingle = True elif c_dtype is np.complex128: @@ -209,14 +244,72 @@ def __init__( super().__init__(wires, shots=shots, c_dtype=c_dtype) self._dp = DevPool() - self._sync = sync - self._batch_obs = batch_obs - self._num_local_wires = self.num_wires - self._gpu_state = _gpu_dtype(c_dtype)(self._num_local_wires) + if not mpi: + self._mpi = False + self._num_local_wires = self.num_wires + self._gpu_state = _gpu_dtype(c_dtype)(self._num_local_wires) + else: + self._mpi = True + self._mpi_init_helper(self.num_wires) + + if mpi_buf_size < 0: + raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}") + if mpi_buf_size: + if mpi_buf_size & (mpi_buf_size - 1): + raise TypeError( + f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2." + ) + # Memory size in bytes + sv_memsize = np.dtype(c_dtype).itemsize * (1 << self._num_local_wires) + if _mebibytesToBytes(mpi_buf_size) > sv_memsize: + w_msg = "The MPI buffer size is larger than the local state vector size." 
+ warn( + w_msg, + RuntimeWarning, + ) + + self._gpu_state = _gpu_dtype(c_dtype, mpi)( + self._mpi_manager, + self._devtag, + mpi_buf_size, + self._num_global_wires, + self._num_local_wires, + ) + + self._sync = sync + self._batch_obs = batch_obs self._create_basis_state(0) + def _mpi_init_helper(self, num_wires): + if not MPI_SUPPORT: + raise ImportError("MPI related APIs are not found.") + # initialize MPIManager and config check in the MPIManager ctor + self._mpi_manager = MPIManager() + # check if number of GPUs per node is larger than + # number of processes per node + numDevices = self._dp.getTotalDevices() + numProcsNode = self._mpi_manager.getSizeNode() + if numDevices < numProcsNode: + raise ValueError( + "Number of devices should be larger than or equal to the number of processes on each node." + ) + # check if the process number is larger than number of statevector elements + if self._mpi_manager.getSize() > (1 << (num_wires - 1)): + raise ValueError( + "Number of processes should be smaller than the number of statevector elements." 
+ ) + # set the number of global and local wires + commSize = self._mpi_manager.getSize() + self._num_global_wires = commSize.bit_length() - 1 + self._num_local_wires = num_wires - self._num_global_wires + # set GPU device + rank = self._mpi_manager.getRank() + deviceid = rank % numProcsNode + self._dp.setDeviceID(deviceid) + self._devtag = DevTag(deviceid) + @staticmethod def _asarray(arr, dtype=None): arr = np.asarray(arr) # arr is not copied @@ -266,11 +359,19 @@ def state(self): @property def create_ops_list(self): """Returns create_ops_list function of the matching precision.""" + if self._mpi: + return create_ops_listMPIC64 if self.use_csingle else create_ops_listMPIC128 return create_ops_listC64 if self.use_csingle else create_ops_listC128 @property def measurements(self): """Returns Measurements constructor of the matching precision.""" + if self._mpi: + return ( + MeasurementsMPIC64(self._gpu_state) + if self.use_csingle + else MeasurementsMPIC128(self._gpu_state) + ) return ( MeasurementsC64(self._gpu_state) if self.use_csingle @@ -345,6 +446,11 @@ def _apply_state_vector(self, state, device_wires, use_async=False): if self.num_wires == self._num_local_wires: self.syncH2D(self._reshape(state, output_shape)) return + local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE) + self._mpi_manager.Scatter(state, local_state, 0) + # Initialize the entire device state with the input state + self.syncH2D(self._reshape(local_state, output_shape)) + return # generate basis states on subset of qubits via the cartesian product basis_states = np.array(list(product([0, 1], repeat=len(device_wires)))) @@ -550,7 +656,9 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): # Check adjoint diff support self._check_adjdiff_supported_operations(tape.operations) - processed_data = self._process_jacobian_tape(tape, starting_state, use_device_state) + processed_data = self._process_jacobian_tape( + tape, starting_state, use_device_state, 
self._mpi + ) if not processed_data: # training_params is empty return np.array([], dtype=self.state.dtype) @@ -565,7 +673,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): - Evenly distribute the observables over all available GPUs (`batch_obs=True`): This will evenly split the data into ceil(num_obs/num_gpus) chunks, and allocate enough space on each GPU up-front before running through them concurrently. This relies on C++ threads to handle the orchestration. - Allocate at most `n` observables per GPU (`batch_obs=n`): Providing an integer value restricts each available GPU to at most `n` copies of the statevector, and hence `n` given observables for a given batch. This will iterate over the data in chnuks of size `n*num_gpus`. """ - adjoint_jacobian = AdjointJacobianC64() if self.use_csingle else AdjointJacobianC128() + adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)() if self._batch_obs: adjoint_jacobian = adjoint_jacobian.batched @@ -661,7 +769,19 @@ def expval(self, observable, shot_range=None, bin_size=None): return np.squeeze(np.mean(samples, axis=0)) if observable.name in ["SparseHamiltonian"]: - CSR_SparseHamiltonian = observable.sparse_matrix().tocsr() + if self._mpi: + # Identity for CSR_SparseHamiltonian to pass to processes with rank != 0 to reduce + # host(cpu) memory requirements + obs = qml.Identity(0) + Hmat = qml.Hamiltonian([1.0], [obs]).sparse_matrix() + H_sparse = qml.SparseHamiltonian(Hmat, wires=range(1)) + CSR_SparseHamiltonian = H_sparse.sparse_matrix().tocsr() + # CSR_SparseHamiltonian for rank == 0 + if self._mpi_manager.getRank() == 0: + CSR_SparseHamiltonian = observable.sparse_matrix().tocsr() + else: + CSR_SparseHamiltonian = observable.sparse_matrix().tocsr() + return self.measurements.expval( CSR_SparseHamiltonian.indptr, CSR_SparseHamiltonian.indices, @@ -671,6 +791,10 @@ def expval(self, observable, shot_range=None, bin_size=None): # use specialized functors to compute expval(Hermitian) 
if observable.name == "Hermitian": observable_wires = self.map_wires(observable.wires) + if self._mpi and len(observable_wires) > self._num_local_wires: + raise RuntimeError( + "MPI backend does not support Hermitian with number of target wires larger than local wire number." + ) matrix = observable.matrix() return self.measurements.expval(matrix, observable_wires) @@ -679,9 +803,9 @@ def expval(self, observable, shot_range=None, bin_size=None): or (observable.arithmetic_depth > 0) or isinstance(observable.name, List) ): - ob_serialized = QuantumScriptSerializer(self.short_name, self.use_csingle)._ob( - observable, self.wire_map - ) + ob_serialized = QuantumScriptSerializer( + self.short_name, self.use_csingle, self._mpi + )._ob(observable, self.wire_map) return self.measurements.expval(ob_serialized) # translate to wire labels used by device @@ -694,8 +818,10 @@ def probability_lightning(self, wires=None): observable_wires = self.map_wires(wires) # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now. 
local_prob = self.measurements.probs(observable_wires) - num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0 - return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1) + if len(local_prob) > 0: + num_local_wires = len(local_prob).bit_length() - 1 if len(local_prob) > 0 else 0 + return local_prob.reshape([2] * num_local_wires).transpose().reshape(-1) + return local_prob # pylint: disable=missing-function-docstring def var(self, observable, shot_range=None, bin_size=None): @@ -718,9 +844,9 @@ def var(self, observable, shot_range=None, bin_size=None): or (observable.arithmetic_depth > 0) or isinstance(observable.name, List) ): - ob_serialized = QuantumScriptSerializer(self.short_name, self.use_csingle)._ob( - observable, self.wire_map - ) + ob_serialized = QuantumScriptSerializer( + self.short_name, self.use_csingle, self._mpi + )._ob(observable, self.wire_map) return self.measurements.var(ob_serialized) # translate to wire labels used by device diff --git a/requirements-dev.txt b/requirements-dev.txt index b377d8acd1..642a74ad27 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,5 +6,7 @@ pybind11 pytest pytest-cov pytest-mock +pre-commit>=2.19.0 black==23.7.0 -clang-format==14 \ No newline at end of file +clang-format==14 +pylint \ No newline at end of file diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py index c8d65dd093..41a9784dc0 100644 --- a/tests/test_adjoint_jacobian.py +++ b/tests/test_adjoint_jacobian.py @@ -25,6 +25,7 @@ from pennylane import QNode, qnode from pennylane import qchem + I, X, Y, Z = ( np.eye(2), qml.PauliX.compute_matrix(), @@ -892,7 +893,10 @@ def circuit_ansatz(params, wires): qml.RX(params[29], wires=wires[1]) -@pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") +@pytest.mark.skipif( + device_name != "lightning.gpu" or not ld._CPP_BINARY_AVAILABLE, + reason="Lightning binary required", +) def test_tape_qchem(tol): 
"""Tests the circuit Ansatz with a QChem Hamiltonian produces correct results""" @@ -944,6 +948,60 @@ def circuit(params): assert np.allclose(qml.grad(circuit_ld)(params), qml.grad(circuit_dq)(params), tol) +custom_wires = ["alice", 3.14, -1, 0] + + +@pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") +@pytest.mark.parametrize( + "returns", + [ + qml.SparseHamiltonian( + qml.Hamiltonian( + [0.1], + [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[1])], + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [2.0], + [qml.PauliX(wires=custom_wires[2]) @ qml.PauliZ(wires=custom_wires[0])], + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + qml.SparseHamiltonian( + qml.Hamiltonian( + [1.1], + [qml.PauliX(wires=custom_wires[0]) @ qml.PauliZ(wires=custom_wires[2])], + ).sparse_matrix(custom_wires), + wires=custom_wires, + ), + ], +) +def test_adjoint_SparseHamiltonian(returns): + """Integration tests that compare to default.qubit for a large circuit containing parametrized + operations and when using custom wire labels""" + + dev = qml.device(device_name, wires=custom_wires) + dev_default = qml.device("default.qubit", wires=custom_wires) + + def circuit(params): + circuit_ansatz(params, wires=custom_wires) + return qml.expval(returns) + + n_params = 30 + np.random.seed(1337) + params = np.random.rand(n_params) + + qnode = qml.QNode(circuit, dev, diff_method="adjoint") + qnode_default = qml.QNode(circuit, dev_default, diff_method="parameter-shift") + + j_device = qml.jacobian(qnode)(params) + j_default = qml.jacobian(qnode_default)(params) + + assert np.allclose(j_device, j_default) + + @pytest.mark.parametrize( "returns", [ @@ -1024,9 +1082,6 @@ def casted_to_array_batched(params): assert np.allclose(j_def, j_lightning_batched) -custom_wires = ["alice", 3.14, -1, 0] - - @pytest.mark.parametrize( "returns", [ @@ -1218,7 +1273,7 @@ def 
create_xyz_file(tmp_path_factory): [False, True, 1, 2, 3, 4], ) def test_integration_H2_Hamiltonian(create_xyz_file, batches): - skipp_condn = pytest.importorskip("openfermionpyscf") + _ = pytest.importorskip("openfermionpyscf") n_electrons = 2 np.random.seed(1337) @@ -1232,9 +1287,10 @@ def test_integration_H2_Hamiltonian(create_xyz_file, batches): active_electrons=n_electrons, name="h2", outpath=str(str_path.parent), + load_data=True, ) hf_state = qml.qchem.hf_state(n_electrons, qubits) - singles, doubles = qml.qchem.excitations(n_electrons, qubits) + _, doubles = qml.qchem.excitations(n_electrons, qubits) # Choose different batching supports here dev = qml.device(device_name, wires=qubits, batch_obs=batches) diff --git a/tests/test_device.py b/tests/test_device.py index a45f269fb1..4039394276 100644 --- a/tests/test_device.py +++ b/tests/test_device.py @@ -48,3 +48,18 @@ def test_create_device_with_unsupported_dtype(): def test_create_device_with_unsupported_kokkos_args(): with pytest.raises(TypeError, match="Argument kokkos_args must be of type"): dev = qml.device(device_name, wires=1, kokkos_args=np.complex256) + + +@pytest.mark.skipif( + device_name != "lightning.gpu" or not ld._CPP_BINARY_AVAILABLE, + reason="Only lightning.gpu has a kwarg mpi_buf_size.", +) +def test_create_device_with_unsupported_mpi_buf_size(): + try: + from mpi4py import MPI + + with pytest.raises(ImportError, match="MPI related APIs are not found"): + dev = qml.device(device_name, wires=1) + dev._mpi_init_helper(1) + except: + pass diff --git a/tests/test_serialize.py b/tests/test_serialize.py index ab6df3c26e..360ac0e71b 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -34,6 +34,8 @@ TensorProdObsC128, HamiltonianC64, HamiltonianC128, + SparseHamiltonianC64, + SparseHamiltonianC128, ) elif device_name == "lightning.gpu": from pennylane_lightning.lightning_gpu_ops.observables import ( @@ -45,6 +47,8 @@ TensorProdObsC128, HamiltonianC64, HamiltonianC128, + 
SparseHamiltonianC64, + SparseHamiltonianC128, ) else: from pennylane_lightning.lightning_qubit_ops.observables import ( @@ -56,6 +60,8 @@ TensorProdObsC128, HamiltonianC64, HamiltonianC128, + SparseHamiltonianC64, + SparseHamiltonianC128, ) @@ -92,6 +98,10 @@ def test_wrong_device_name(): (qml.Projector([0], wires=0), HermitianObsC128), (qml.Hamiltonian([1], [qml.PauliZ(0)]), HamiltonianC128), (qml.sum(qml.Hadamard(0), qml.PauliX(1)), HermitianObsC128), + ( + qml.SparseHamiltonian(qml.Hamiltonian([1], [qml.PauliZ(0)]).sparse_matrix(), wires=[0]), + SparseHamiltonianC128, + ), ], ) def test_obs_returns_expected_type(obs, obs_type):