From 3718dd66b15561faa97166a2cbf16394ead9d7f5 Mon Sep 17 00:00:00 2001
From: Mudit Pandey <mudit.pandey@xanadu.ai>
Date: Tue, 17 Sep 2024 17:32:48 -0400
Subject: [PATCH 1/5] Test changes to account for 0.39 deprecations (#912)

Updating some tests to account for various deprecations happening in
pennylane for v0.39.

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
---
 .github/CHANGELOG.md                 | 5 ++++-
 mpitests/test_adjoint_jacobian.py    | 6 +++---
 pennylane_lightning/core/_version.py | 2 +-
 tests/test_adjoint_jacobian.py       | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 0262afc467..ce2d763cf0 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -31,6 +31,9 @@
 
 ### Improvements
 
+* Update the test suite to remove deprecated code.
+  [(#912)](https://github.com/PennyLaneAI/pennylane-lightning/pull/912)
+
 * Skip the compilation of Lightning simulators and development requirements to boost the build of public docs up to 5x.
   [(#904)](https://github.com/PennyLaneAI/pennylane-lightning/pull/904)
 
@@ -75,7 +78,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Amintor Dusko, Luis Alfredo Nuñez Meneses, Vincent Michaud-Rioux, Lee J. O'Riordan, Shuli Shu
+Ali Asadi, Amintor Dusko, Luis Alfredo Nuñez Meneses, Vincent Michaud-Rioux, Lee J. O'Riordan, Mudit Pandey, Shuli Shu
 
 ---
 
diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index a1d4be0260..6f3b5c7f5b 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -1378,7 +1378,7 @@ def circuit(params):
 def test_qubit_unitary(dev, n_targets):
     """Tests that ``qml.QubitUnitary`` can be included in circuits differentiated with the adjoint method."""
     n_wires = len(dev.wires)
-    dev_def = qml.device("default.qubit.legacy", wires=n_wires)
+    dev_def = qml.device("default.qubit", wires=n_wires)
     h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
     c_dtype = np.complex64 if dev.R_DTYPE == np.float32 else np.complex128
 
@@ -1396,7 +1396,7 @@ def test_qubit_unitary(dev, n_targets):
 
     init_state = np.array(init_state, requires_grad=False, dtype=c_dtype)
     U = np.array(U, requires_grad=False, dtype=c_dtype)
-    obs = qml.operation.Tensor(*(qml.PauliZ(i) for i in range(n_wires)))
+    obs = qml.prod(*(qml.PauliZ(i) for i in range(n_wires)))
 
     def circuit(x):
         qml.StatePrep(init_state, wires=range(n_wires))
@@ -1444,7 +1444,7 @@ def test_diff_qubit_unitary(dev, n_targets):
 
     init_state = np.array(init_state, requires_grad=False, dtype=c_dtype)
     U = np.array(U, requires_grad=False, dtype=c_dtype)
-    obs = qml.operation.Tensor(*(qml.PauliZ(i) for i in range(n_wires)))
+    obs = qml.prod(*(qml.PauliZ(i) for i in range(n_wires)))
 
     def circuit(x, u_mat):
         qml.StatePrep(init_state, wires=range(n_wires))
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 065f0933a9..1d0b6fde2a 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev24"
+__version__ = "0.39.0-dev25"
diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
index 57f007a45f..83713b1c69 100644
--- a/tests/test_adjoint_jacobian.py
+++ b/tests/test_adjoint_jacobian.py
@@ -1544,7 +1544,7 @@ def test_qubit_unitary(n_targets):
     """Tests that ``qml.QubitUnitary`` can be included in circuits differentiated with the adjoint method."""
     n_wires = 6
     dev = qml.device(device_name, wires=n_wires)
-    dev_def = qml.device("default.qubit.legacy", wires=n_wires)
+    dev_def = qml.device("default.qubit", wires=n_wires)
 
     np.random.seed(1337)
     init_state = np.random.rand(2**n_wires) + 1j * np.random.rand(2**n_wires)

From eb709d5c81ba2a180c8f2caecf20d8e92e5e40aa Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:54:26 -0400
Subject: [PATCH 2/5] Add MPO support  to `lightning.tensor` (#859)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-70902]

MPO (Matrix Product Operator) support is added to `lightning.tensor` so
that all gates can be supported. Any gate acting on 2 or more wires, and
any controlled gate with 1 or more target wires, can be applied via an
`MPO`. The current implementation only accepts already-decomposed gate
data. For Python users, the `MPO` decomposition is performed in the
Python layer with `numpy`. A further improvement would be to perform the
MPO decomposition with `cusolver` for large unitary matrices.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
Co-authored-by: Shiro-Raven <exclass9.24@gmail.com>
Co-authored-by: albi3ro <chrissie.c.l@gmail.com>
Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: Luis Alfredo Nuñez Meneses <alfredo.nunez@xanadu.ai>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: vincentmr <vincentmr@users.noreply.github.com>
Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>
Co-authored-by: Lee James O'Riordan <mlxd@users.noreply.github.com>
Co-authored-by: paul0403 <79805239+paul0403@users.noreply.github.com>
Co-authored-by: Raul Torres <138264735+rauletorresc@users.noreply.github.com>
---
 .github/CHANGELOG.md                          |   3 +
 pennylane_lightning/core/_version.py          |   2 +-
 .../lightning_tensor/tncuda/MPOTNCuda.cpp     |  20 ++
 .../lightning_tensor/tncuda/MPOTNCuda.hpp     | 227 ++++++++++++++++++
 .../lightning_tensor/tncuda/MPSTNCuda.hpp     |  90 +++++++
 .../tncuda/bindings/LTensorTNCudaBindings.hpp |  16 ++
 .../gates/tests/Test_MPSTNCuda_NonParam.cpp   | 126 ++++++++++
 .../gates/tests/Test_MPSTNCuda_Param.cpp      |  75 ++++++
 .../tncuda/tests/Tests_MPSTNCuda.cpp          |  37 +++
 .../utils/tncuda_utils/CMakeLists.txt         |   4 +
 .../utils/tncuda_utils/tests/CMakeLists.txt   |  33 +++
 .../tncuda_utils/tests/Test_TNCuda_utils.cpp  |  86 +++++++
 .../runner_lightning_tensor_tncuda_utils.cpp  |   2 +
 .../utils/tncuda_utils/tncuda_helpers.hpp     |  90 +++++++
 .../lightning_tensor/_tensornet.py            | 151 +++++++++---
 .../lightning_tensor/lightning_tensor.py      |  41 +++-
 .../test_measurements_class.py                |  46 +---
 .../lightning_tensor/test_gates_and_expval.py |  58 +++--
 .../lightning_tensor/test_tensornet_class.py  |  57 ++++-
 tests/test_apply.py                           |   4 -
 tests/test_execute.py                         |   7 +-
 tests/test_gates.py                           |  43 ++--
 tests/test_measurements.py                    |   2 +-
 tests/test_templates.py                       |  49 ++--
 24 files changed, 1112 insertions(+), 157 deletions(-)
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.cpp
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.hpp
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/CMakeLists.txt
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/Test_TNCuda_utils.cpp
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/runner_lightning_tensor_tncuda_utils.cpp

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index ce2d763cf0..334d76ffab 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add Matrix Product Operator (MPO) for all gates support to `lightning.tensor`. Note current C++ implementation only works for MPO sites data provided by users. 
+  [(#859)](https://github.com/PennyLaneAI/pennylane-lightning/pull/859)
+
 * Add shot measurement support to `lightning.tensor`.
   [(#852)](https://github.com/PennyLaneAI/pennylane-lightning/pull/852)
 
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 1d0b6fde2a..ebebbc9f97 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev25"
+__version__ = "0.39.0-dev26"
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.cpp
new file mode 100644
index 0000000000..6f3873472f
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.cpp
@@ -0,0 +1,20 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "MPOTNCuda.hpp"
+
+using namespace Pennylane::LightningTensor::TNCuda;
+
+template class Pennylane::LightningTensor::TNCuda::MPOTNCuda<float>;
+template class Pennylane::LightningTensor::TNCuda::MPOTNCuda<double>;
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.hpp
new file mode 100644
index 0000000000..0704572536
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPOTNCuda.hpp
@@ -0,0 +1,227 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file MPOTNCuda.hpp
+ * Class for cuTensorNet-backed Matrix Product Operator.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cuComplex.h>
+#include <cutensornet.h>
+#include <vector>
+
+#include "TensorCuda.hpp"
+#include "tncudaError.hpp"
+#include "tncuda_helpers.hpp"
+
+namespace {
+namespace cuUtil = Pennylane::LightningGPU::Util;
+}
+
+namespace Pennylane::LightningTensor::TNCuda {
+
+/**
+ * @brief Class representing a Matrix Product Operator (MPO) object for the MPS
+ backend.
+ * Any gate tensor can be represented as an MPO tensor network in the context of
+ MPS. The gate tensor must be decomposed with respect to its target wires. Note
+ that only local (adjacent) target wires are supported. Non-adjacent target
+ wires must be swapped to local before constructing the MPO tensor network.
+ * The MPO tensors' modes order in an open boundary condition are:
+   2              3              2
+   |              |              |
+   X--1--....--0--X--2--....--0--X
+   |              |              |
+   0              1              1
+
+ * The extents of the MPO tensors are [bondL, 2, bondR, 2]. The bondL of the
+ left side bound MPO tensor is 1 and the bondR of the right side bound MPO
+ tensor is 1.
+
+ * Note that the gate tensor should be permuted to ascending wire order and
+ decomposed into MPO sites before being passed to this class. Pre- and
+ post-processing with SWAP operations is required so that the MPO targets
+ adjacent wires and the original wire order is restored afterwards.
+
+ * @tparam PrecisionT Floating point type.
+ */
+template <class PrecisionT> class MPOTNCuda {
+  private:
+    using ComplexT = std::complex<PrecisionT>;
+    using CFP_t = decltype(cuUtil::getCudaType(PrecisionT{}));
+
+    cutensornetNetworkOperator_t MPOOperator_;
+    cuDoubleComplex coeff_ =
+        make_cuDoubleComplex(1.0, 0.0); // default coefficient
+    cutensornetBoundaryCondition_t boundaryCondition_{
+        CUTENSORNET_BOUNDARY_CONDITION_OPEN}; // open boundary condition
+    int64_t componentIdx_;
+
+    std::vector<std::size_t> bondDims_;
+
+    std::size_t numMPOSites_;
+    std::vector<int32_t> MPO_modes_int32_;
+
+    std::vector<std::vector<int64_t>> modesExtents_int64_;
+    // TODO: Explore if tensors_ can be stored in a separate memory manager
+    // class
+    std::vector<std::shared_ptr<TensorCuda<PrecisionT>>> tensors_;
+
+    /**
+     * @brief Get a vector of pointers to extents of each site.
+     *
+     * @return std::vector<int64_t const *> Note int64_t const* is
+     * required by cutensornet backend.
+     */
+    [[nodiscard]] auto getModeExtentsPtr_() -> std::vector<int64_t const *> {
+        std::vector<int64_t const *> modeExtentsPtr_int64;
+        for (auto it = modesExtents_int64_.cbegin();
+             it != modesExtents_int64_.cend(); it++) {
+            modeExtentsPtr_int64.emplace_back(it->data());
+        }
+        return modeExtentsPtr_int64;
+    }
+
+    /**
+     * @brief Get a vector of pointers to tensor data of each site.
+     *
+     * @return std::vector<void *>
+     */
+    [[nodiscard]] auto getTensorsDataPtr_() -> std::vector<void *> {
+        std::vector<void *> tensorsDataPtr;
+        for (auto &tensor : tensors_) {
+            tensorsDataPtr.emplace_back(
+                reinterpret_cast<void *>(tensor->getDataBuffer().getData()));
+        }
+        return tensorsDataPtr;
+    }
+
+  public:
+    explicit MPOTNCuda(const std::vector<std::vector<ComplexT>> &tensors,
+                       const std::vector<std::size_t> &wires,
+                       const std::size_t maxMPOBondDim,
+                       const std::size_t numQubits,
+                       const cutensornetHandle_t &cutensornetHandle,
+                       const cudaDataType_t &cudaDataType,
+                       const DevTag<int> &dev_tag) {
+        PL_ABORT_IF_NOT(tensors.size() == wires.size(),
+                        "Number of tensors and wires must match.");
+
+        PL_ABORT_IF(maxMPOBondDim < 2,
+                    "Max MPO bond dimension must be at least 2.");
+
+        PL_ABORT_IF_NOT(std::is_sorted(wires.begin(), wires.end()),
+                        "Only sorted target wires is accepeted.");
+
+        PL_ABORT_IF_NOT(wires.size() == wires.back() - wires.front() + 1,
+                        "Only support local target wires.");
+
+        // Create an empty MPO tensor network operator. Note that the state
+        // extents are aligned with the quantum state.
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateNetworkOperator(
+            /* const cutensornetHandle_t */ cutensornetHandle,
+            /* int32_t */ static_cast<int32_t>(numQubits),
+            /* const int64_t stateModeExtents */
+            std::vector<int64_t>(numQubits, 2).data(),
+            /* cudaDataType_t */ cudaDataType,
+            /* cutensornetNetworkOperator_t */ &MPOOperator_));
+
+        numMPOSites_ = wires.size();
+
+        MPO_modes_int32_.resize(numMPOSites_);
+
+        std::iota(MPO_modes_int32_.begin(), MPO_modes_int32_.end(),
+                  wires.front());
+
+        std::transform(MPO_modes_int32_.begin(), MPO_modes_int32_.end(),
+                       MPO_modes_int32_.begin(),
+                       [&numQubits](const std::size_t mode) {
+                           return static_cast<int32_t>(numQubits - 1 - mode);
+                       });
+
+        // Ensure the modes are in ascending order
+        std::reverse(MPO_modes_int32_.begin(), MPO_modes_int32_.end());
+
+        for (std::size_t i = 0; i < numMPOSites_ - 1; i++) {
+            // Binary logarithm of the bond dimension required for the exact MPO
+            // decomposition
+            const std::size_t lg_bondDim_exact =
+                std::min(i + 1, numMPOSites_ - i - 1) *
+                2; // 1+1 (1 for bra and 1 for ket)
+
+            const std::size_t bondDim =
+                lg_bondDim_exact <= log2(maxMPOBondDim)
+                    ? (std::size_t{1} << lg_bondDim_exact)
+                    : maxMPOBondDim;
+
+            bondDims_.emplace_back(bondDim);
+        }
+
+        for (std::size_t i = 0; i < numMPOSites_; i++) {
+            const std::size_t bondDimR =
+                i < numMPOSites_ - 1 ? bondDims_[i] : 1;
+            const std::size_t bondDimL = i > 0 ? bondDims_[i - 1] : 1;
+
+            auto localModesExtents =
+                i == 0 ? std::vector<std::size_t>{2, bondDimR, 2}
+                : i == numMPOSites_ - 1
+                    ? std::vector<std::size_t>{bondDimL, 2, 2}
+                    : std::vector<std::size_t>{bondDimL, 2, bondDimR, 2};
+
+            modesExtents_int64_.emplace_back(
+                Pennylane::Util::cast_vector<std::size_t, int64_t>(
+                    localModesExtents));
+
+            tensors_.emplace_back(std::make_shared<TensorCuda<PrecisionT>>(
+                localModesExtents.size(), localModesExtents, localModesExtents,
+                dev_tag));
+
+            auto tensor_cu = cuUtil::complexToCu<ComplexT>(tensors[i]);
+            tensors_[i]->getDataBuffer().CopyHostDataToGpu(tensor_cu.data(),
+                                                           tensor_cu.size());
+        }
+
+        // Append MPO tensor network operator components
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetNetworkOperatorAppendMPO(
+            /* const cutensornetHandle_t */ cutensornetHandle,
+            /* cutensornetNetworkOperator_t */ MPOOperator_,
+            /* const cuDoubleComplex */ coeff_,
+            /* int32_t numStateModes */ static_cast<int32_t>(numMPOSites_),
+            /* const int32_t stateModes[] */ MPO_modes_int32_.data(),
+            /* const int64_t *stateModeExtents[] */
+            getModeExtentsPtr_().data(),
+            /* const int64_t *tensorModeStrides[] */ nullptr,
+            /* const void * */
+            const_cast<const void **>(getTensorsDataPtr_().data()),
+            /* cutensornetBoundaryCondition_t */ boundaryCondition_,
+            /* int64_t * */ &componentIdx_));
+    }
+
+    auto getMPOOperator() const -> const cutensornetNetworkOperator_t & {
+        return MPOOperator_;
+    }
+
+    auto getBondDims() const -> const std::vector<std::size_t> & {
+        return bondDims_;
+    }
+
+    ~MPOTNCuda() {
+        PL_CUTENSORNET_IS_SUCCESS(
+            cutensornetDestroyNetworkOperator(MPOOperator_));
+    };
+};
+} // namespace Pennylane::LightningTensor::TNCuda
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
index 81a115e685..dd37b3c994 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
@@ -29,6 +29,7 @@
 
 #include "DataBuffer.hpp"
 #include "DevTag.hpp"
+#include "MPOTNCuda.hpp"
 #include "TNCudaBase.hpp"
 #include "TensorCuda.hpp"
 #include "TensornetBase.hpp"
@@ -73,6 +74,9 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
 
     std::vector<TensorCuda<Precision>> tensors_out_;
 
+    std::vector<std::shared_ptr<MPOTNCuda<Precision>>> mpos_;
+    std::vector<std::size_t> mpo_ids_;
+
   public:
     using CFP_t = decltype(cuUtil::getCudaType(Precision{}));
     using ComplexT = std::complex<Precision>;
@@ -232,6 +236,77 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
         }
     };
 
+    /**
+     * @brief Apply an MPO operator with the gate's MPO decomposition data
+     * provided by the user to the compute graph.
+     *
+     * This API only works for the MPS backend.
+     *
+     * @param tensors The MPO representation of a gate. Each element in the
+     * outer vector represents a MPO tensor site.
+     * @param wires The wire indices the gate acts on. The size of this
+     * vector should match the size of the `tensors` vector.
+     * @param max_mpo_bond_dim The maximum bond dimension of the MPO operator.
+     */
+    void applyMPOOperation(const std::vector<std::vector<ComplexT>> &tensors,
+                           const std::vector<std::size_t> &wires,
+                           const std::size_t max_mpo_bond_dim) {
+        PL_ABORT_IF_NOT(
+            tensors.size() == wires.size(),
+            "The number of tensors should be equal to the number of "
+            "wires.");
+
+        // Create a queue of wire pairs to apply SWAP gates and MPO local target
+        // wires
+        const auto [local_wires, swap_wires_queue] =
+            create_swap_wire_pair_queue(wires);
+
+        // Apply SWAP gates to ensure the following MPO operator targeting at
+        // local wires
+        if (swap_wires_queue.size() > 0) {
+            for_each(swap_wires_queue.begin(), swap_wires_queue.end(),
+                     [this](const auto &swap_wires) {
+                         for_each(swap_wires.begin(), swap_wires.end(),
+                                  [this](const auto &wire_pair) {
+                                      BaseType::applyOperation(
+                                          "SWAP", wire_pair, false);
+                                  });
+                     });
+        }
+
+        // Create a MPO object based on the host data from the user
+        mpos_.emplace_back(std::make_shared<MPOTNCuda<Precision>>(
+            tensors, local_wires, max_mpo_bond_dim, BaseType::getNumQubits(),
+            BaseType::getTNCudaHandle(), BaseType::getCudaDataType(),
+            BaseType::getDevTag()));
+
+        // Append the MPO operator to the compute graph
+        // Note MPO operator only works for local target wires as of v24.08
+        int64_t operatorId;
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetStateApplyNetworkOperator(
+            /* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
+            /* cutensornetState_t */ BaseType::getQuantumState(),
+            /* cutensornetNetworkOperator_t */ mpos_.back()->getMPOOperator(),
+            /* const int32_t immutable */ 1,
+            /* const int32_t adjoint */ 0,
+            /* const int32_t unitary */ 1,
+            /* int64_t * operatorId*/ &operatorId));
+
+        mpo_ids_.push_back(static_cast<std::size_t>(operatorId));
+
+        // Apply SWAP gates to restore the original wire order
+        if (swap_wires_queue.size() > 0) {
+            for_each(swap_wires_queue.rbegin(), swap_wires_queue.rend(),
+                     [this](const auto &swap_wires) {
+                         for_each(swap_wires.rbegin(), swap_wires.rend(),
+                                  [this](const auto &wire_pair) {
+                                      BaseType::applyOperation(
+                                          "SWAP", wire_pair, false);
+                                  });
+                     });
+        }
+    }
+
     /**
      * @brief Append MPS final state to the quantum circuit.
      *
@@ -276,6 +351,21 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
             /* const void * */ &cutoff,
             /* std::size_t */ sizeof(cutoff)));
 
+        // MPO configurations
+        // Note that CUTENSORNET_STATE_MPO_APPLICATION_INEXACT is applied if the
+        // `cutoff` value is not set to 0 for the MPO application.
+        cutensornetStateMPOApplication_t mpo_attribute =
+            (cutoff == 0) ? CUTENSORNET_STATE_MPO_APPLICATION_EXACT
+                          : CUTENSORNET_STATE_MPO_APPLICATION_INEXACT;
+
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetStateConfigure(
+            /* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
+            /* cutensornetState_t */ BaseType::getQuantumState(),
+            /* cutensornetStateAttributes_t */
+            CUTENSORNET_STATE_CONFIG_MPS_MPO_APPLICATION,
+            /* const void * */ &mpo_attribute,
+            /* std::size_t */ sizeof(mpo_attribute)));
+
         BaseType::computeState(
             const_cast<int64_t **>(getSitesExtentsPtr().data()),
             reinterpret_cast<void **>(getTensorsOutDataPtr().data()));
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/bindings/LTensorTNCudaBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/bindings/LTensorTNCudaBindings.hpp
index cc494db85e..2e7825bbb6 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/bindings/LTensorTNCudaBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/bindings/LTensorTNCudaBindings.hpp
@@ -150,6 +150,22 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
                 tensor_network.setBasisState(basisState);
             },
             "Create Basis State on GPU.")
+        .def(
+            "applyMPOOperation",
+            [](TensorNet &tensor_network, std::vector<np_arr_c> &tensors,
+               std::vector<std::size_t> &wires, const std::size_t MPOBondDims) {
+                using ComplexT = typename TensorNet::ComplexT;
+                std::vector<std::vector<ComplexT>> conv_tensors;
+                for (const auto &tensor : tensors) {
+                    py::buffer_info numpyArrayInfo = tensor.request();
+                    auto *m_ptr = static_cast<ComplexT *>(numpyArrayInfo.ptr);
+                    conv_tensors.push_back(
+                        std::vector<ComplexT>{m_ptr, m_ptr + tensor.size()});
+                }
+                tensor_network.applyMPOOperation(conv_tensors, wires,
+                                                 MPOBondDims);
+            },
+            "Apply MPO to the tensor network graph.")
         .def(
             "appendMPSFinalState",
             [](TensorNet &tensor_network, double cutoff,
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
index c3826b83c3..b58fa3bbec 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
@@ -585,3 +585,129 @@ TEMPLATE_TEST_CASE("MPSTNCuda::applyControlledOperation non-param "
                           LightningException);
     }
 }
+
+TEMPLATE_TEST_CASE("MPSTNCuda::applyMPO::2+_wires", "[MPSTNCuda_NonParam]",
+                   float, double) {
+    const bool inverse = GENERATE(false, true);
+    {
+        using cp_t = std::complex<TestType>;
+        std::size_t maxExtent = 2;
+        std::size_t max_mpo_bond = 16;
+        DevTag<int> dev_tag{0, 0};
+
+        std::vector<std::vector<cp_t>> mpo_cnot(
+            2, std::vector<cp_t>(16, {0.0, 0.0}));
+
+        // in-order decomposition of the cnot operator
+        // data from scipy decompose in the lightning.tensor python layer
+        mpo_cnot[0][0] = {1.0, 0.0};
+        mpo_cnot[0][3] = {-1.0, 0.0};
+        mpo_cnot[0][9] = {1.0, 0.0};
+        mpo_cnot[0][10] = {-1.0, 0.0};
+
+        mpo_cnot[1][0] = {1.0, 0.0};
+        mpo_cnot[1][7] = {-1.0, 0.0};
+        mpo_cnot[1][10] = {1.0, 0.0};
+        mpo_cnot[1][13] = {-1.0, 0.0};
+
+        std::vector<std::vector<cp_t>> mpo_cswap;
+        mpo_cswap.emplace_back(std::vector<cp_t>(16, {0.0, 0.0}));
+        mpo_cswap.emplace_back(std::vector<cp_t>(64, {0.0, 0.0}));
+        mpo_cswap.emplace_back(std::vector<cp_t>(16, {0.0, 0.0}));
+
+        mpo_cswap[0][0] = {-1.5811388300841898, 0.0};
+        mpo_cswap[0][2] = {0.7071067811865475, 0.0};
+        mpo_cswap[0][5] = {-1.0, 0.0};
+        mpo_cswap[0][9] = mpo_cswap[0][0];
+        mpo_cswap[0][11] = -mpo_cswap[0][2];
+        mpo_cswap[0][14] = {1.0, 0.0};
+
+        mpo_cswap[1][0] = {-0.413452607315265, 0.0};
+        mpo_cswap[1][1] = {0.6979762349196628, 0.0};
+        mpo_cswap[1][7] = {0.9870874576374964, 0.0};
+        mpo_cswap[1][8] = {0.5736348503222318, 0.0};
+        mpo_cswap[1][9] = {0.11326595025589799, 0.0};
+        mpo_cswap[1][15] = {0.16018224300696726, 0.0};
+        mpo_cswap[1][34] = -mpo_cswap[1][7];
+        mpo_cswap[1][36] = mpo_cswap[1][0];
+        mpo_cswap[1][37] = -mpo_cswap[1][1];
+        mpo_cswap[1][42] = -mpo_cswap[1][15];
+        mpo_cswap[1][44] = mpo_cswap[1][8];
+        mpo_cswap[1][45] = -mpo_cswap[1][9];
+
+        mpo_cswap[2][0] = mpo_cswap[1][15];
+        mpo_cswap[2][1] = -mpo_cswap[1][7];
+        mpo_cswap[2][7] = {1.0, 0.0};
+        mpo_cswap[2][10] = {-1.0, 0.0};
+        mpo_cswap[2][12] = -mpo_cswap[2][1];
+        mpo_cswap[2][13] = mpo_cswap[2][0];
+
+        SECTION("Target at wire indices") {
+            std::size_t num_qubits = 3;
+
+            MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+
+            MPSTNCuda<TestType> mps_state_mpo{num_qubits, maxExtent, dev_tag};
+
+            mps_state.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                      {{0}, {1}, {2}}, {false, false, false});
+
+            mps_state_mpo.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                          {{0}, {1}, {2}},
+                                          {false, false, false});
+
+            mps_state.applyOperation("CNOT", {0, 1}, inverse);
+
+            mps_state_mpo.applyMPOOperation(mpo_cnot, {0, 1}, max_mpo_bond);
+
+            auto ref = mps_state.getDataVector();
+            auto res = mps_state_mpo.getDataVector();
+
+            CHECK(res == Pennylane::Util::approx(ref));
+        }
+
+        SECTION("Target at non-adjacent wire indices") {
+            std::size_t num_qubits = 3;
+
+            MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+
+            MPSTNCuda<TestType> mps_state_mpo{num_qubits, maxExtent, dev_tag};
+
+            mps_state.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                      {{0}, {1}, {2}}, {false, false, false});
+
+            mps_state_mpo.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                          {{0}, {1}, {2}},
+                                          {false, false, false});
+
+            mps_state.applyOperation("CNOT", {0, 2}, inverse);
+
+            mps_state_mpo.applyMPOOperation(mpo_cnot, {0, 2}, max_mpo_bond);
+
+            auto ref = mps_state.getDataVector();
+            auto res = mps_state_mpo.getDataVector();
+
+            CHECK(res == Pennylane::Util::approx(ref));
+        }
+
+        SECTION("Tests for 3-wire MPOs") {
+            std::size_t num_qubits = 3;
+
+            MPSTNCuda<TestType> mps_state_mpo{num_qubits, maxExtent, dev_tag};
+            MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+
+            mps_state_mpo.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                          {{0}, {1}, {2}},
+                                          {false, false, false});
+            mps_state.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                      {{0}, {1}, {2}}, {false, false, false});
+
+            mps_state_mpo.applyMPOOperation(mpo_cswap, {0, 1, 2}, max_mpo_bond);
+
+            auto res = mps_state_mpo.getDataVector();
+            auto ref = mps_state.getDataVector();
+
+            CHECK(res == Pennylane::Util::approx(ref));
+        }
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_Param.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_Param.cpp
index 37fbf90826..c5caa31715 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_Param.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_Param.cpp
@@ -1084,3 +1084,78 @@ TEMPLATE_TEST_CASE("MPSTNCuda::Param_Gates::2+_wires", "[MPSTNCuda_Param]",
         }
     }
 }
+
+TEMPLATE_TEST_CASE("MPSTNCuda::applyMPO::SingleExcitation", "[MPSTNCuda_Param]",
+                   float, double) {
+    using cp_t = std::complex<TestType>;
+    std::size_t maxExtent = 2;
+    std::size_t max_mpo_bond = 4;
+    DevTag<int> dev_tag{0, 0};
+
+    std::vector<std::vector<cp_t>> mpo_single_excitation(
+        2, std::vector<cp_t>(16, {0.0, 0.0}));
+
+    // in-order decomposition of the SingleExcitation operator
+    // data from scipy decompose in the lightning.tensor python layer
+    mpo_single_excitation[0][0] = {-1.40627352, 0.0};
+    mpo_single_excitation[0][3] = {-0.14943813, 0.0};
+    mpo_single_excitation[0][6] = {0.00794005, 0.0};
+    mpo_single_excitation[0][9] = {-1.40627352, 0.0};
+    mpo_single_excitation[0][12] = {-0.14943813, 0.0};
+    mpo_single_excitation[0][15] = {-0.00794005, 0.0};
+
+    mpo_single_excitation[1][0] = {-0.707106781, 0.0};
+    mpo_single_excitation[1][3] = {0.707106781, 0.0};
+    mpo_single_excitation[1][6] = {1.0, 0.0};
+    mpo_single_excitation[1][9] = {-1.0, 0.0};
+    mpo_single_excitation[1][12] = {-0.707106781, 0.0};
+    mpo_single_excitation[1][15] = {-0.707106781, 0.0};
+
+    SECTION("Target at wire indices") {
+        std::size_t num_qubits = 3;
+
+        MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+
+        MPSTNCuda<TestType> mps_state_mpo{num_qubits, maxExtent, dev_tag};
+
+        mps_state.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                  {{0}, {1}, {2}}, {false, false, false});
+
+        mps_state_mpo.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                      {{0}, {1}, {2}}, {false, false, false});
+
+        mps_state.applyOperation("SingleExcitation", {0, 1}, false, {0.3});
+
+        mps_state_mpo.applyMPOOperation(mpo_single_excitation, {0, 1},
+                                        max_mpo_bond);
+
+        auto ref = mps_state.getDataVector();
+        auto res = mps_state_mpo.getDataVector();
+
+        CHECK(res == Pennylane::Util::approx(ref));
+    }
+
+    SECTION("Target at non-adjacent wire indices") {
+        std::size_t num_qubits = 3;
+
+        MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
+
+        MPSTNCuda<TestType> mps_state_mpo{num_qubits, maxExtent, dev_tag};
+
+        mps_state.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                  {{0}, {1}, {2}}, {false, false, false});
+
+        mps_state_mpo.applyOperations({"Hadamard", "Hadamard", "Hadamard"},
+                                      {{0}, {1}, {2}}, {false, false, false});
+
+        mps_state.applyOperation("SingleExcitation", {0, 2}, false, {0.3});
+
+        mps_state_mpo.applyMPOOperation(mpo_single_excitation, {0, 2},
+                                        max_mpo_bond);
+
+        auto ref = mps_state.getDataVector();
+        auto res = mps_state_mpo.getDataVector();
+
+        CHECK(res == Pennylane::Util::approx(ref));
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/tests/Tests_MPSTNCuda.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/tests/Tests_MPSTNCuda.cpp
index 1cea281878..1a03366669 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/tests/Tests_MPSTNCuda.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/tests/Tests_MPSTNCuda.cpp
@@ -22,6 +22,7 @@
 #include <catch2/catch.hpp>
 
 #include "DevTag.hpp"
+#include "MPOTNCuda.hpp"
 #include "MPSTNCuda.hpp"
 #include "cuda_helpers.hpp"
 
@@ -287,3 +288,39 @@ TEMPLATE_TEST_CASE("MPSTNCuda::getDataVector()", "[MPSTNCuda]", float, double) {
                 "The number of qubits should be greater than 1."));
     }
 }
+
+TEMPLATE_TEST_CASE("MPOTNCuda::getBondDims()", "[MPOTNCuda]", float, double) {
+    using cp_t = std::complex<TestType>;
+    SECTION("Check if bondDims is correctly set") {
+        const std::size_t num_qubits = 3;
+        const std::size_t maxBondDim = 128;
+        const DevTag<int> dev_tag{0, 0};
+
+        MPSTNCuda<TestType> mps{num_qubits, maxBondDim, dev_tag};
+
+        std::vector<std::vector<cp_t>> tensors; //([2,2,3], [3,2,2,3], [3,2,2])
+        const std::vector<std::size_t> wires = {0, 1, 2};
+        const std::size_t maxMPOBondDim = 3;
+
+        tensors.emplace_back(std::vector<cp_t>(12, {0.0, 0.0}));
+        tensors.emplace_back(std::vector<cp_t>(36, {0.0, 0.0}));
+        tensors.emplace_back(std::vector<cp_t>(12, {0.0, 0.0}));
+
+        const auto tensors_const = tensors;
+
+        MPOTNCuda<TestType> mpo{tensors_const,
+                                wires,
+                                maxMPOBondDim,
+                                num_qubits,
+                                mps.getTNCudaHandle(),
+                                mps.getCudaDataType(),
+                                dev_tag};
+
+        auto bondDims = mpo.getBondDims();
+
+        std::vector<std::size_t> expected_bondDims = {maxMPOBondDim,
+                                                      maxMPOBondDim};
+
+        CHECK(bondDims == expected_bondDims);
+    }
+}
\ No newline at end of file
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/CMakeLists.txt
index 559b5ca437..9d956ede67 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/CMakeLists.txt
@@ -7,3 +7,7 @@ target_include_directories(tncuda_utils INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(tncuda_utils INTERFACE lightning_utils lightning_compile_options lightning_external_libs)
 
 set_property(TARGET tncuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+if (BUILD_TESTS)
+    add_subdirectory(tests)
+endif()
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/CMakeLists.txt
new file mode 100644
index 0000000000..801e41d6fe
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(tncuda_utils_tests)
+
+# Default build type for test code is Debug
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Debug)
+endif()
+
+include("${pennylane_lightning_SOURCE_DIR}/cmake/support_tests.cmake")
+FetchAndIncludeCatch()
+
+################################################################################
+# Define library
+################################################################################
+
+add_library(tncuda_utils_tests INTERFACE)
+target_link_libraries(tncuda_utils_tests INTERFACE Catch2::Catch2 tncuda_utils)
+
+ProcessTestOptions(tncuda_utils_tests)
+
+target_sources(tncuda_utils_tests INTERFACE runner_${PL_BACKEND}_tncuda_utils.cpp)
+
+################################################################################
+# Define targets
+################################################################################
+set(TEST_SOURCES    Test_TNCuda_utils.cpp)
+
+add_executable(tncuda_utils_test_runner ${TEST_SOURCES})
+target_link_libraries(tncuda_utils_test_runner PRIVATE tncuda_utils_tests)
+catch_discover_tests(tncuda_utils_test_runner)
+
+install(TARGETS tncuda_utils_test_runner DESTINATION bin)
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/Test_TNCuda_utils.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/Test_TNCuda_utils.cpp
new file mode 100644
index 0000000000..63c399b1ba
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/Test_TNCuda_utils.cpp
@@ -0,0 +1,86 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the License);
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <catch2/catch.hpp>
+
+#include "tncuda_helpers.hpp"
+
+/// @cond DEV
+namespace {
+using namespace Pennylane::LightningTensor::TNCuda::Util;
+} // namespace
+/// @endcond
+
+TEST_CASE("swap_op_wires_queue", "[TNCuda_utils]") {
+    SECTION("is_wires_local: true") {
+        std::vector<std::size_t> wires = {0, 1, 2, 3};
+        REQUIRE(is_wires_local(wires) == true);
+    }
+
+    SECTION("is_wires_local: false") {
+        std::vector<std::size_t> wires = {0, 1, 3, 4};
+        REQUIRE(is_wires_local(wires) == false);
+    }
+
+    SECTION("swap_op_wires_queue: local") {
+        std::vector<std::size_t> wires = {0, 1, 2, 3};
+        auto [target_wires, swap_wires_queue] =
+            create_swap_wire_pair_queue(wires);
+        REQUIRE(wires == target_wires);
+        REQUIRE(swap_wires_queue.empty() == true);
+    }
+
+    SECTION("swap_op_wires_queue: non-local [0,1,n_wires-1]") {
+        std::vector<std::size_t> wires = {0, 1, 4};
+
+        std::vector<std::size_t> target_wires_ref = {0, 1, 2};
+        std::vector<std::vector<std::size_t>> swap_wires_queue_ref = {{4, 3},
+                                                                      {3, 2}};
+        auto [local_wires, swap_wires_queue] =
+            create_swap_wire_pair_queue(wires);
+        REQUIRE(local_wires == target_wires_ref);
+        REQUIRE(swap_wires_queue.size() == 1);
+        REQUIRE(swap_wires_queue[0] == swap_wires_queue_ref);
+    }
+
+    SECTION("swap_op_wires_queue: non-local [0,n_wires-2,n_wires-1]") {
+        std::vector<std::size_t> wires = {0, 3, 4};
+
+        std::vector<std::size_t> target_wires_ref = {2, 3, 4};
+        std::vector<std::vector<std::size_t>> swap_wires_queue_ref = {{0, 1},
+                                                                      {1, 2}};
+        auto [local_wires, swap_wires_queue] =
+            create_swap_wire_pair_queue(wires);
+        REQUIRE(local_wires == target_wires_ref);
+        REQUIRE(swap_wires_queue.size() == 1);
+        REQUIRE(swap_wires_queue[0] == swap_wires_queue_ref);
+    }
+
+    SECTION("swap_op_wires_queue: non-local [0,n_wires/2,n_wires-1]") {
+        std::vector<std::size_t> wires = {0, 2, 4};
+        std::vector<std::size_t> target_wires_ref = {1, 2, 3};
+        std::vector<std::vector<std::size_t>> swap_wires_queue_ref0 = {{0, 1}};
+        std::vector<std::vector<std::size_t>> swap_wires_queue_ref1 = {{4, 3}};
+        auto [local_wires, swap_wires_queue] =
+            create_swap_wire_pair_queue(wires);
+        REQUIRE(local_wires == target_wires_ref);
+        REQUIRE(swap_wires_queue.size() == 2);
+        REQUIRE(swap_wires_queue[0] == swap_wires_queue_ref0);
+        REQUIRE(swap_wires_queue[1] == swap_wires_queue_ref1);
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/runner_lightning_tensor_tncuda_utils.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/runner_lightning_tensor_tncuda_utils.cpp
new file mode 100644
index 0000000000..4ed06df1f7
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tests/runner_lightning_tensor_tncuda_utils.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch.hpp>
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tncuda_helpers.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tncuda_helpers.hpp
index b126a63d8d..4fadfadb40 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tncuda_helpers.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/utils/tncuda_utils/tncuda_helpers.hpp
@@ -104,4 +104,94 @@ inline void setWorkSpaceMemory(const cutensornetHandle_t &tncuda_handle,
         /* int64_t */ static_cast<int64_t>(worksize)));
 }
 
+/**
+ * @brief Check if the wires are local, i.e. each wire is its predecessor + 1.
+ * @param wires The wires to check.
+ * @return true if wires are consecutive; trivially true for 0 or 1 wires.
+ */
+inline bool is_wires_local(const std::vector<std::size_t> &wires) {
+    // Start at 1: `wires.size() - 1` would wrap around for an empty vector.
+    for (std::size_t i = 1; i < wires.size(); ++i) {
+        if (wires[i] - wires[i - 1] != 1) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Create a queue of swap operations to be performed on the MPS.
+ *
+ * @param wires The target wires, sorted in ascending order.
+ *
+ * @return A tuple containing the local target wires and the swap wire queue.
+ */
+inline auto create_swap_wire_pair_queue(const std::vector<std::size_t> &wires)
+    -> std::tuple<std::vector<std::size_t>,
+                  std::vector<std::vector<std::vector<std::size_t>>>> {
+    PL_ABORT_IF_NOT(std::is_sorted(wires.begin(), wires.end()),
+                    "The wires should be in ascending order.");
+
+    std::vector<std::vector<std::vector<std::size_t>>> swap_wires_queue;
+    std::vector<std::size_t> local_wires;
+
+    if (is_wires_local(wires)) {
+        local_wires = wires;
+    } else {
+        const std::size_t num_wires = wires.size();
+        // The middle target wire stays put; the others are moved next to it.
+        const std::size_t fix_wire_pos = num_wires / std::size_t{2U};
+        const std::size_t fixed_gate_wire_idx = wires[fix_wire_pos];
+
+        local_wires.push_back(fixed_gate_wire_idx);
+
+        int32_t left_wire_pos = static_cast<int32_t>(fix_wire_pos) - 1;
+        int32_t right_wire_pos = static_cast<int32_t>(fix_wire_pos) + 1;
+
+        while (left_wire_pos >= 0 ||
+               right_wire_pos < static_cast<int32_t>(num_wires)) {
+            if (left_wire_pos >= 0) {
+                std::vector<std::vector<std::size_t>> local_swap_wires_queue;
+                const std::size_t begin = wires[left_wire_pos];
+                const std::size_t end =
+                    wires[fix_wire_pos] - (fix_wire_pos - left_wire_pos);
+                // Swap the left wire rightwards up to its local position.
+                if (begin < end) {
+                    for (std::size_t i = begin; i < end; i++) {
+                        local_swap_wires_queue.emplace_back(
+                            std::vector<std::size_t>{i, i + 1});
+                    }
+                    swap_wires_queue.emplace_back(local_swap_wires_queue);
+                }
+
+                std::size_t left_most_wire = local_wires[0] - 1;
+
+                local_wires.insert(local_wires.begin(), left_most_wire);
+
+                left_wire_pos--;
+            }
+            // Swap the right wire leftwards down to its local position.
+            if (right_wire_pos < static_cast<int32_t>(num_wires)) {
+                std::vector<std::vector<std::size_t>> local_swap_wires_queue;
+                const std::size_t begin = wires[right_wire_pos];
+                const std::size_t end =
+                    wires[fix_wire_pos] + (right_wire_pos - fix_wire_pos);
+                if (begin > end) {
+                    for (std::size_t i = begin; i > end; i--) {
+                        local_swap_wires_queue.emplace_back(
+                            std::vector<std::size_t>{i, i - 1});
+                    }
+                    swap_wires_queue.emplace_back(local_swap_wires_queue);
+                }
+
+                std::size_t right_most_wire = local_wires.back() + 1;
+                local_wires.push_back(right_most_wire);
+
+                right_wire_pos++;
+            }
+        }
+    }
+    return {local_wires, swap_wires_queue};
+}
+
 } // namespace Pennylane::LightningTensor::TNCuda::Util
diff --git a/pennylane_lightning/lightning_tensor/_tensornet.py b/pennylane_lightning/lightning_tensor/_tensornet.py
index 31a64c9ace..05849ad4bb 100644
--- a/pennylane_lightning/lightning_tensor/_tensornet.py
+++ b/pennylane_lightning/lightning_tensor/_tensornet.py
@@ -31,44 +31,95 @@
 from pennylane.wires import Wires
 
 
-def svd_split(M, bond_dim):
-    """SVD split a matrix into a matrix product state via numpy linalg. Note that this function is to be moved to the C++ layer."""
-    U, S, Vd = np.linalg.svd(M, full_matrices=False)
-    U = U @ np.diag(S)  # Append singular values to U
+def svd_split(Mat, site_shape, max_bond_dim):
+    """SVD decomposition of a matrix via numpy linalg. Note that this function is to be moved to the C++ layer."""
+    # TODO: Check if cutensornet allows us to remove all zero (or < tol) singular values and the respective rows and columns of U and Vd
+    U, S, Vd = np.linalg.svd(Mat, full_matrices=False)
+    U = U * S  # Append singular values to U
     bonds = len(S)
-    Vd = Vd.reshape(bonds, 2, -1)
-    U = U.reshape((-1, 2, bonds))
+
+    Vd = Vd.reshape([bonds] + site_shape + [-1])
+    U = U.reshape([-1] + site_shape + [bonds])
 
     # keep only chi bonds
-    chi = np.min([bonds, bond_dim])
-    U, S, Vd = U[:, :, :chi], S[:chi], Vd[:chi]
+    chi = min([bonds, max_bond_dim])
+    U, Vd = U[..., :chi], Vd[:chi]
     return U, Vd
 
 
-def dense_to_mps(psi, n_wires, bond_dim):
-    """Convert a dense state vector to a matrix product state."""
+def decompose_dense(psi, n_wires, site_shape, max_bond_dim):
+    """Decompose a dense state vector/gate matrix into MPS/MPO sites."""
     Ms = [[] for _ in range(n_wires)]
+    site_len = np.prod(site_shape)
+    psi = np.reshape(psi, (site_len, -1))  # split psi [2, 2, 2, 2...] to psi [site_len, -1]
 
-    psi = np.reshape(psi, (2, -1))  # split psi[2, 2, 2, 2..] = psi[2, (2x2x2...)]
-    U, Vd = svd_split(psi, bond_dim)  # psi[2, (2x2x..)] = U[2, mu] Vd[mu, (2x2x2x..)]
+    U, Vd = svd_split(
+        psi, site_shape, max_bond_dim
+    )  # psi [site_len, -1] -> U [site_len, mu] Vd [mu, (2x2x2x..)]
 
-    Ms[0] = U
+    Ms[0] = U.reshape(site_shape + [-1])
     bondL = Vd.shape[0]
     psi = Vd
 
     for i in range(1, n_wires - 1):
-        psi = np.reshape(psi, (2 * bondL, -1))  # reshape psi[2 * bondL, (2x2x2...)]
-        U, Vd = svd_split(psi, bond_dim)  # psi[2, (2x2x..)] = U[2, mu] Vd[mu, (2x2x2x..)]
+        psi = np.reshape(psi, (site_len * bondL, -1))  # reshape psi[site_len*bondL, -1]
+        U, Vd = svd_split(
+            psi, site_shape, max_bond_dim
+        )  # psi [site_len*bondL, -1] -> U [site_len, mu] Vd [mu, (2x2x2x..)]
         Ms[i] = U
 
         psi = Vd
         bondL = Vd.shape[0]
 
-    Ms[n_wires - 1] = Vd
+    Ms[-1] = Vd.reshape([-1] + site_shape)
 
     return Ms
 
 
+def gate_matrix_decompose(gate_ops_matrix, wires, max_mpo_bond_dim, c_dtype):
+    """Permute and decompose a gate matrix into MPO sites. This function returns the MPO sites in the Fortran order of the ``cutensornet`` backend. Note that the MSB in the PennyLane convention is the LSB in the ``cutensornet`` convention."""
+    sorted_indexed_wires = sorted(enumerate(wires), key=lambda x: x[1])
+
+    original_axes, sorted_wires = zip(*sorted_indexed_wires)
+
+    tensor_shape = [2] * len(wires) * 2
+
+    matrix = gate_ops_matrix.astype(c_dtype)
+
+    # Convert the gate matrix to the correct shape and complex dtype
+    gate_tensor = matrix.reshape(tensor_shape)
+
+    # Create the correct order of indices for the gate tensor to be decomposed
+    indices_order = []
+    for i in range(len(wires)):
+        indices_order.extend([original_axes[i], original_axes[i] + len(wires)])
+    # Reverse the indices order to match the target wire order of cutensornet backend
+    indices_order.reverse()
+
+    # Permutation of the gate tensor
+    gate_tensor = np.transpose(gate_tensor, axes=indices_order)
+
+    mpo_site_shape = [2] * 2
+
+    # The indices order of MPOs: 1. left-most site: [ket, bra, bondR]; 2. right-most site: [bondL, ket, bra]; 3. sites in-between: [bondL, ket, bra, bondR].
+    MPOs = decompose_dense(gate_tensor, len(wires), mpo_site_shape, max_mpo_bond_dim)
+
+    # Convert the MPOs to the correct order for the cutensornet backend
+    mpos = []
+    for index, MPO in enumerate(MPOs):
+        if index == 0:
+            # [ket, bra, bond](0, 1, 2) -> [ket, bond, bra](0, 2, 1) -> Fortran order or reverse indices(1, 2, 0) to match the order requirement of cutensornet backend.
+            mpos.append(np.transpose(MPO, axes=(1, 2, 0)))
+        elif index == len(MPOs) - 1:
+            # [bond, ket, bra](0, 1, 2) -> Fortran order or reverse indices(2, 1, 0) to match the order requirement of cutensornet backend.
+            mpos.append(np.transpose(MPO, axes=(2, 1, 0)))
+        else:
+            # [bondL, ket, bra, bondR](0, 1, 2, 3) -> [bondL, ket, bondR, bra](0, 1, 3, 2) -> Fortran order or reverse indices(2, 3, 1, 0) to match the requirement of cutensornet backend.
+            mpos.append(np.transpose(MPO, axes=(2, 3, 1, 0)))
+
+    return mpos, sorted_wires
+
+
 # pylint: disable=too-many-instance-attributes
 class LightningTensorNet:
     """Lightning tensornet class.
@@ -110,6 +161,8 @@ def __init__(
         if num_wires < 2:
             raise ValueError("Number of wires must be greater than 1.")
 
+        self._wires = Wires(range(num_wires))
+
         self._device_name = device_name
         self._tensornet = self._tensornet_dtype()(self._num_wires, self._max_bond_dim)
 
@@ -195,8 +248,8 @@ def _apply_state_vector(self, state, device_wires: Wires):
         """
 
         state = self._preprocess_state_vector(state, device_wires)
-
-        M = dense_to_mps(state, self._num_wires, self._max_bond_dim)
+        mps_site_shape = [2]
+        M = decompose_dense(state, self._num_wires, mps_site_shape, self._max_bond_dim)
 
         self._tensornet.updateMPSSitesData(M)
 
@@ -222,6 +275,26 @@ def _apply_basis_state(self, state, wires):
 
         self._tensornet.setBasisState(state)
 
+    def _apply_MPO(self, gate_matrix, wires):
+        """Apply a matrix product operator to the quantum state.
+
+        Args:
+            gate_matrix (array[complex/float]): matrix representation of the MPO
+            wires (Wires): wires that the MPO should be applied to
+        Returns:
+            None
+        """
+        # TODO: Discuss whether to expose max_mpo_bond_dim as a public interface argument
+        max_mpo_bond_dim = 2 ** len(wires)  # Exact SVD decomposition for MPO
+
+        # Get sorted wires and MPO site tensor
+        mpos, sorted_wires = gate_matrix_decompose(
+            gate_matrix, wires, max_mpo_bond_dim, self._c_dtype
+        )
+
+        self._tensornet.applyMPOOperation(mpos, sorted_wires, max_mpo_bond_dim)
+
+    # pylint: disable=too-many-branches
     def _apply_lightning_controlled(self, operation):
         """Apply an arbitrary controlled operation to the state tensor. Note that `cutensornet` only supports controlled gates with a single wire target.
 
@@ -238,20 +311,16 @@ def _apply_lightning_controlled(self, operation):
         control_wires = list(operation.control_wires)
         control_values = operation.control_values
         target_wires = list(operation.target_wires)
-        if method is not None:  # apply n-controlled specialized gate
+
+        if method is not None and basename not in ("GlobalPhase", "MultiRZ"):
             inv = False
             param = operation.parameters
             method(control_wires, control_values, target_wires, inv, param)
         else:  # apply gate as an n-controlled matrix
             method = getattr(tensornet, "applyControlledMatrix")
-            method(
-                qml.matrix(operation.base),
-                control_wires,
-                control_values,
-                target_wires,
-                False,
-            )
+            method(qml.matrix(operation.base), control_wires, control_values, target_wires, False)
 
+    # pylint: disable=too-many-statements
     def _apply_lightning(self, operations):
         """Apply a list of operations to the quantum state.
 
@@ -279,18 +348,32 @@ def _apply_lightning(self, operations):
 
             if isinstance(operation, qml.ops.Controlled) and len(list(operation.target_wires)) == 1:
                 self._apply_lightning_controlled(operation)
-            elif method is not None:  # apply specialized gate
-                param = operation.parameters
-                method(wires, invert_param, param)
-            else:  # apply gate as a matrix
-                # Inverse can be set to False since qml.matrix(operation) is already in
-                # inverted form
+            elif isinstance(operation, qml.GlobalPhase):
+                matrix = np.eye(2) * operation.matrix().flatten()[0]
                 method = getattr(tensornet, "applyMatrix")
+                # GlobalPhase is always applied to the first wire in the tensor network
+                method(matrix, [0], False)
+            elif len(wires) <= 2:
+                if method is not None:
+                    param = operation.parameters
+                    method(wires, invert_param, param)
+                else:
+                    # Inverse can be set to False since qml.matrix(operation) is already in
+                    # inverted form
+                    method = getattr(tensornet, "applyMatrix")
+                    try:
+                        method(qml.matrix(operation), wires, False)
+                    except AttributeError:  # pragma: no cover
+                        # To support older versions of PL
+                        method(operation.matrix(), wires, False)
+            else:
                 try:
-                    method(qml.matrix(operation), wires, False)
+                    gate_ops_matrix = qml.matrix(operation)
                 except AttributeError:  # pragma: no cover
                     # To support older versions of PL
-                    method(operation.matrix, wires, False)
+                    gate_ops_matrix = operation.matrix()
+
+                self._apply_MPO(gate_ops_matrix, wires)
 
     def apply_operations(self, operations):
         """Append operations to the tensor network graph."""
diff --git a/pennylane_lightning/lightning_tensor/lightning_tensor.py b/pennylane_lightning/lightning_tensor/lightning_tensor.py
index 01ec65c5cb..ff53d41ede 100644
--- a/pennylane_lightning/lightning_tensor/lightning_tensor.py
+++ b/pennylane_lightning/lightning_tensor/lightning_tensor.py
@@ -73,12 +73,12 @@
         "BasisState",
         "QubitUnitary",
         "ControlledQubitUnitary",
-        "MultiControlledX",
         "DiagonalQubitUnitary",
         "PauliX",
         "PauliY",
         "PauliZ",
         "Hadamard",
+        "GlobalPhase",
         "S",
         "Adjoint(S)",
         "T",
@@ -99,6 +99,26 @@
         "CZ",
         "PhaseShift",
         "ControlledPhaseShift",
+        "C(Hadamard)",
+        "C(S)",
+        "C(T)",
+        "C(PhaseShift)",
+        "C(RX)",
+        "C(RY)",
+        "C(RZ)",
+        "C(Rot)",
+        "C(IsingXX)",
+        "C(IsingYY)",
+        "C(IsingZZ)",
+        "C(IsingXY)",
+        "C(SingleExcitation)",
+        "C(SingleExcitationPlus)",
+        "C(SingleExcitationMinus)",
+        "C(DoubleExcitation)",
+        "C(DoubleExcitationMinus)",
+        "C(DoubleExcitationPlus)",
+        "C(GlobalPhase)",
+        "C(MultiRZ)",
         "RX",
         "RY",
         "RZ",
@@ -115,11 +135,15 @@
         "SingleExcitationPlus",
         "SingleExcitationMinus",
         "DoubleExcitation",
+        "DoubleExcitationPlus",
+        "DoubleExcitationMinus",
         "QubitCarry",
         "QubitSum",
         "OrbitalRotation",
         "QFT",
         "ECR",
+        "BlockEncode",
+        "C(BlockEncode)",
     }
 )
 
@@ -144,10 +168,17 @@
 
 def stopping_condition(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by the ``mps`` method of ``lightning.tensor``."""
-    # These thresholds are adapted from `lightning_base.py`
-    # To avoid building matrices beyond the given thresholds.
-    # This should reduce runtime overheads for larger systems.
-    return op.has_matrix and len(op.wires) <= 2 and op.name in _operations
+    # TODO: These thresholds are from ``lightning.qubit`` and should be adjusted based on benchmarking tests for the MPS
+    #  simulator (against both max_mps_bond_dim and number of qubits).
+    if isinstance(op, qml.QFT):
+        return len(op.wires) < 10
+    if isinstance(op, qml.GroverOperator):
+        return len(op.wires) < 13
+
+    if isinstance(op, qml.ControlledQubitUnitary):
+        return True
+
+    return op.has_matrix and op.name in _operations
 
 
 def simulate(circuit: QuantumScript, tensornet: LightningTensorNet) -> Result:
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index 2567366393..471fb6de6f 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -75,25 +75,6 @@ def obs_not_supported_in_ltensor(obs):
         return False
 
 
-# Ops not supported in lightning.tensor
-def ops_not_supported_in_ltensor(ops):
-    if device_name == "lightning.tensor":
-        unsupported_ops = [qml.MultiRZ, qml.GlobalPhase]
-        if any([ops == op for op in unsupported_ops]):
-            return True
-        return False
-    else:
-        return False
-
-
-def controlled_gate_not_supported_in_ltensor(ops):
-    if device_name == "lightning.tensor":
-        if ops.num_wires > 1:
-            return True
-    else:
-        return False
-
-
 def get_final_state(statevector, tape):
     if device_name == "lightning.tensor":
         return statevector.set_tensor_network(tape)
@@ -677,6 +658,8 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
         assert len(result) == len(expected)
         # a few tests may fail in single precision, and hence we increase the tolerance
         dtol = tol if shots is None else max(tol, 1.0e-2)
+        if device_name == "lightning.tensor" and statevector.dtype == np.complex64:
+            dtol = max(dtol, 1.0e-4)
         # TODO Set better atol and rtol
         for r, e in zip(result, expected):
             if isinstance(shots, tuple) and isinstance(r[0], np.ndarray):
@@ -762,11 +745,6 @@ def test_controlled_qubit_gates(self, operation, n_qubits, control_value, tol, l
         num_wires = max(operation.num_wires, 1)
         np.random.seed(0)
 
-        if ops_not_supported_in_ltensor(operation):
-            pytest.skip("Controlled operation not supported in lightning.tensor.")
-        if controlled_gate_not_supported_in_ltensor(operation):
-            pytest.skip("Controlled operation not supported in lightning.tensor.")
-
         for n_wires in range(num_wires + 1, num_wires + 4):
             wire_lists = list(itertools.permutations(range(0, n_qubits), n_wires))
             n_perms = len(wire_lists) * n_wires
@@ -821,7 +799,7 @@ def test_controlled_qubit_gates(self, operation, n_qubits, control_value, tol, l
                     assert np.allclose(result, expected, tol * 10)
 
     @pytest.mark.skipif(
-        device_name != "lightning.qubit",
+        device_name not in ("lightning.qubit", "lightning.tensor"),
         reason="N-controlled operations only implemented in lightning.qubit.",
     )
     def test_controlled_qubit_unitary_from_op(self, tol, lightning_sv):
@@ -886,10 +864,8 @@ def test_cnot_controlled_qubit_unitary(self, control_wires, target_wires, tol, l
     @pytest.mark.parametrize("n_qubits", list(range(2, 8)))
     def test_controlled_globalphase(self, n_qubits, control_value, tol, lightning_sv):
         """Test that multi-controlled gates are correctly applied to a state"""
-        threshold = 250
+        threshold = 250 if device_name != "lightning.tensor" else 5
         operation = qml.GlobalPhase
-        if ops_not_supported_in_ltensor(operation):
-            pytest.skip("Operation not supported in lightning.tensor.")
         num_wires = max(operation.num_wires, 1)
         for n_wires in range(num_wires + 1, num_wires + 4):
             wire_lists = list(itertools.permutations(range(0, n_qubits), n_wires))
@@ -908,9 +884,11 @@ def test_controlled_globalphase(self, n_qubits, control_value, tol, lightning_sv
                         qml.ctrl(
                             operation(0.1234, target_wires),
                             control_wires,
-                            control_values=[
-                                control_value or bool(i % 2) for i, _ in enumerate(control_wires)
-                            ],
+                            control_values=(
+                                [control_value or bool(i % 2) for i, _ in enumerate(control_wires)]
+                                if device_name != "lightning.tensor"
+                                else [control_value for _ in control_wires]
+                            ),
                         ),
                     ],
                     [qml.state()],
@@ -920,8 +898,10 @@ def test_controlled_globalphase(self, n_qubits, control_value, tol, lightning_sv
                 m = LightningMeasurements(statevector)
                 result = measure_final_state(m, tape)
                 expected = self.calculate_reference(tape)
-
-                assert np.allclose(result, expected, tol)
+                if device_name == "lightning.tensor" and statevector.dtype == np.complex64:
+                    assert np.allclose(result, expected, 1e-4)
+                else:
+                    assert np.allclose(result, expected, tol)
 
 
 @pytest.mark.parametrize("phi", PHI)
diff --git a/tests/lightning_tensor/test_gates_and_expval.py b/tests/lightning_tensor/test_gates_and_expval.py
index e8a73fcb5a..66604c2b05 100644
--- a/tests/lightning_tensor/test_gates_and_expval.py
+++ b/tests/lightning_tensor/test_gates_and_expval.py
@@ -88,34 +88,52 @@ def circuit_ansatz(params, wires):
     qml.SWAP(wires=[wires[2], wires[3]])
     qml.adjoint(qml.ISWAP(wires=[wires[0], wires[1]]))
     qml.ISWAP(wires=[wires[4], wires[5]])
+    qml.ISWAP(wires=[wires[4], wires[6]])
     qml.PSWAP(params[0], wires=[wires[6], wires[7]])
+    qml.PSWAP(params[1], wires=[wires[0], wires[7]])
     qml.adjoint(qml.SISWAP(wires=[wires[0], wires[1]]))
+    qml.adjoint(qml.SISWAP(wires=[wires[0], wires[4]]))
     qml.SISWAP(wires=[wires[4], wires[5]])
+    qml.SISWAP(wires=[wires[2], wires[5]])
     qml.SQISW(wires=[wires[1], wires[0]])
+    qml.SQISW(wires=[wires[5], wires[0]])
     qml.CSWAP(wires=[wires[2], wires[4], wires[5]])
     qml.Toffoli(wires=[wires[0], wires[1], wires[2]])
+    qml.Toffoli(wires=[wires[0], wires[1], wires[5]])
     qml.CY(wires=[wires[0], wires[2]])
     qml.CZ(wires=[wires[1], wires[3]])
-    qml.PhaseShift(params[1], wires=wires[2])
-    qml.ControlledPhaseShift(params[2], wires=[wires[0], wires[5]])
-    qml.RX(params[3], wires=wires[0])
-    qml.RY(params[4], wires=wires[1])
-    qml.RZ(params[5], wires=wires[3])
-    qml.Rot(params[6], params[7], params[8], wires=wires[0])
-    qml.CRX(params[9], wires=[wires[1], wires[0]])
-    qml.CRY(params[10], wires=[wires[3], wires[2]])
-    qml.CRZ(params[11], wires=[wires[2], wires[1]])
-    qml.IsingXX(params[12], wires=[wires[1], wires[0]])
-    qml.IsingYY(params[13], wires=[wires[3], wires[2]])
-    qml.IsingXY(params[14], wires=[wires[2], wires[1]])
-    qml.IsingZZ(params[15], wires=[wires[2], wires[1]])
-    qml.SingleExcitation(params[16], wires=[wires[2], wires[0]])
-    qml.SingleExcitationPlus(params[17], wires=[wires[3], wires[1]])
-    qml.SingleExcitationMinus(params[18], wires=[wires[4], wires[2]])
-    qml.DoubleExcitation(params[19], wires=[wires[0], wires[1], wires[2], wires[3]])
+    qml.PhaseShift(params[2], wires=wires[2])
+    qml.ControlledPhaseShift(params[3], wires=[wires[0], wires[5]])
+    qml.RX(params[4], wires=wires[0])
+    qml.RY(params[5], wires=wires[1])
+    qml.RZ(params[6], wires=wires[3])
+    qml.Rot(params[7], params[8], params[9], wires=wires[0])
+    qml.CRX(params[10], wires=[wires[1], wires[0]])
+    qml.CRY(params[11], wires=[wires[3], wires[2]])
+    qml.CRZ(params[12], wires=[wires[2], wires[1]])
+    qml.CRX(params[13], wires=[wires[1], wires[5]])
+    qml.CRY(params[14], wires=[wires[3], wires[6]])
+    qml.CRZ(params[15], wires=[wires[2], wires[0]])
+    qml.IsingXX(params[16], wires=[wires[1], wires[0]])
+    qml.IsingYY(params[17], wires=[wires[3], wires[2]])
+    qml.IsingXY(params[18], wires=[wires[2], wires[1]])
+    qml.IsingZZ(params[19], wires=[wires[2], wires[1]])
+    qml.IsingXX(params[20], wires=[wires[1], wires[5]])
+    qml.IsingYY(params[21], wires=[wires[3], wires[0]])
+    qml.IsingXY(params[22], wires=[wires[2], wires[4]])
+    qml.IsingZZ(params[23], wires=[wires[2], wires[0]])
+    qml.SingleExcitation(params[24], wires=[wires[2], wires[0]])
+    qml.SingleExcitationPlus(params[25], wires=[wires[3], wires[1]])
+    qml.SingleExcitationMinus(params[26], wires=[wires[4], wires[2]])
+    qml.DoubleExcitation(params[27], wires=[wires[0], wires[1], wires[2], wires[3]])
+    qml.DoubleExcitationPlus(params[28], wires=[wires[1], wires[2], wires[3], wires[4]])
+    qml.DoubleExcitationMinus(params[29], wires=[wires[2], wires[3], wires[4], wires[5]])
+    qml.DoubleExcitation(params[30], wires=[wires[0], wires[2], wires[4], wires[6]])
+    qml.DoubleExcitationPlus(params[31], wires=[wires[0], wires[2], wires[4], wires[6]])
+    qml.DoubleExcitationMinus(params[32], wires=[wires[0], wires[2], wires[4], wires[6]])
     qml.QubitCarry(wires=[wires[0], wires[1], wires[6], wires[7]])
     qml.QubitSum(wires=[wires[2], wires[3], wires[7]])
-    qml.OrbitalRotation(params[20], wires=[wires[0], wires[1], wires[5], wires[6]])
+    qml.OrbitalRotation(params[33], wires=[wires[0], wires[1], wires[5], wires[6]])
     qml.QFT(wires=[wires[0]])
     qml.ECR(wires=[wires[1], wires[3]])
 
@@ -162,14 +180,14 @@ def test_integration_for_all_supported_gates(returns):
     operations"""
     num_wires = 8
     dev_default = qml.device("default.qubit", wires=range(num_wires))
-    dev_ltensor = LightningTensor(wires=range(num_wires), max_bond_dim=16, c_dtype=np.complex128)
+    dev_ltensor = LightningTensor(wires=range(num_wires), max_bond_dim=128, c_dtype=np.complex128)
 
     def circuit(params):
         qml.BasisState(np.array([1, 0, 1, 0, 1, 0, 1, 0]), wires=range(num_wires))
         circuit_ansatz(params, wires=range(num_wires))
         return qml.math.hstack([qml.expval(r) for r in returns])
 
-    n_params = 22
+    n_params = 34
     np.random.seed(1337)
     params_init = np.random.rand(n_params)
 
diff --git a/tests/lightning_tensor/test_tensornet_class.py b/tests/lightning_tensor/test_tensornet_class.py
index a2c8d74a43..638af52e2c 100644
--- a/tests/lightning_tensor/test_tensornet_class.py
+++ b/tests/lightning_tensor/test_tensornet_class.py
@@ -20,13 +20,18 @@
 import numpy as np
 import pennylane as qml
 import pytest
+import scipy
 from conftest import LightningDevice, device_name  # tested device
 from pennylane.wires import Wires
 
 if device_name != "lightning.tensor":
     pytest.skip("Skipping tests for the tensornet class.", allow_module_level=True)
 else:
-    from pennylane_lightning.lightning_tensor._tensornet import LightningTensorNet
+    from pennylane_lightning.lightning_tensor._tensornet import (
+        LightningTensorNet,
+        decompose_dense,
+        gate_matrix_decompose,
+    )
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:  # pylint: disable=protected-access
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
@@ -63,3 +68,53 @@ def test_errors_basis_state():
     with pytest.raises(ValueError, match="State must be of length 1;"):
         tensornet = LightningTensorNet(3, 5)
         tensornet.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0])])
+
+
+def test_dense_decompose():
+    """Test that ``decompose_dense`` yields MPO sites that contract back to the input gate."""
+    n_wires = 3
+    site_shape = [2, 2]
+    max_mpo_bond_dim = 128
+
+    hermitian = np.random.default_rng(1337).random((2**n_wires, 2**n_wires))  # seeded input
+    hermitian = hermitian @ hermitian.conj().T
+
+    gate = scipy.linalg.expm(1j * hermitian)
+    original_gate = gate.copy()  # reference copy, compared against the rebuilt matrix below
+
+    mpos = decompose_dense(gate, n_wires, site_shape, max_mpo_bond_dim)
+
+    # contract neighbouring sites over their shared index, then flatten back to a matrix
+    unitary = np.tensordot(mpos[0], mpos[1], axes=([2], [0]))
+    unitary = np.tensordot(unitary, mpos[2], axes=([-1], [0]))
+    unitary = np.reshape(unitary, (2**n_wires, 2**n_wires))
+
+    assert np.allclose(unitary, original_gate, atol=1e-6)
+
+
+def test_gate_matrix_decompose():
+    """Test that ``gate_matrix_decompose`` yields MPO sites that rebuild the input gate."""
+    wires = [0, 1, 2]
+    hermitian = np.random.default_rng(1337).random((2 ** len(wires), 2 ** len(wires)))
+    hermitian = hermitian @ hermitian.conj().T
+
+    gate = scipy.linalg.expm(1j * hermitian)
+    original_gate = gate.copy()  # reference copy, compared against the rebuilt matrix below
+
+    max_mpo_bond_dim = 2 ** len(wires)
+
+    mpos, sorted_wires = gate_matrix_decompose(gate, wires, max_mpo_bond_dim, np.complex128)
+
+    # restore the C-ordering of the matrices
+    mpo0 = np.transpose(mpos[0], axes=(2, 1, 0))
+    mpo1 = np.transpose(mpos[1], axes=(3, 2, 1, 0))
+    mpo2 = np.transpose(mpos[2], axes=(2, 1, 0))
+
+    # recreate unitary
+    unitary = np.tensordot(mpo0, mpo1, axes=([1], [0]))
+    unitary = np.tensordot(unitary, mpo2, axes=([3], [0]))
+    unitary_f = np.transpose(unitary, axes=(5, 3, 1, 4, 2, 0))
+    unitary_f = np.reshape(unitary_f, (2 ** len(wires), 2 ** len(wires)))
+
+    assert np.allclose(sorted_wires, sorted(wires), atol=1e-6)
+    assert np.allclose(unitary_f, original_gate, atol=1e-6)
diff --git a/tests/test_apply.py b/tests/test_apply.py
index 9e769b8533..b5e7c48bd8 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -1276,10 +1276,6 @@ def circuit():
 
     # Check the BlockEncode PennyLane page for details:
     # https://docs.pennylane.ai/en/stable/code/api/pennylane.BlockEncode.html
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support qml.BlockEncode",
-    )
     @pytest.mark.parametrize(
         "op, op_wires",
         [
diff --git a/tests/test_execute.py b/tests/test_execute.py
index 02a6bfa1f6..4e2eb03327 100644
--- a/tests/test_execute.py
+++ b/tests/test_execute.py
@@ -25,10 +25,6 @@
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
-@pytest.mark.skipif(
-    device_name == "lightning.tensor",
-    reason="lightning.tensor does not support gates with more than 2 wires, preprocess is required for the following tests",
-)
 @pytest.mark.usefixtures("use_legacy_and_new_opmath")
 @pytest.mark.parametrize("diff_method", ("param_shift", "finite_diff"))
 class TestQChem:
@@ -37,6 +33,9 @@ class TestQChem:
     def test_VQE_gradients(self, diff_method, tol):
         """Test if the VQE procedure returns the expected gradients."""
 
+        if qml.operation.active_new_opmath() and device_name == "lightning.tensor":
+            pytest.skip("The new operation math is not yet fully supported for lightning.tensor")
+
         symbols = ["H", "H"]
 
         geometry = np.array(
diff --git a/tests/test_gates.py b/tests/test_gates.py
index 99b2f75594..da414c3789 100644
--- a/tests/test_gates.py
+++ b/tests/test_gates.py
@@ -168,11 +168,6 @@ def test_inverse_unitary_correct(op, op_name):
     if wires == 1 and device_name == "lightning.tensor":
         pytest.skip("Skipping single wire device on lightning.tensor.")
 
-    if op_name == "QubitUnitary" and device_name == "lightning.tensor":
-        pytest.skip(
-            "Skipping QubitUnitary on lightning.tensor. It can't be decomposed into 1-wire or 2-wire gates"
-        )
-
     dev = qml.device(device_name, wires=wires)
 
     @qml.qnode(dev)
@@ -312,7 +307,7 @@ def circuit():
 
 
 @pytest.mark.parametrize("theta,phi", list(zip(THETA, PHI)))
-@pytest.mark.parametrize("n_wires", range(1, 7) if device_name != "lightning.tensor" else [1, 2])
+@pytest.mark.parametrize("n_wires", range(1, 7))
 def test_qubit_unitary(n_wires, theta, phi, tol):
     """Test that Hadamard expectation value is correct"""
     n_qubits = 10
@@ -460,10 +455,6 @@ def test_controlled_qubit_gates(operation, n_qubits, control_value, tol):
     dev = qml.device(device_name, wires=n_qubits)
     threshold = 5 if device_name == "lightning.tensor" else 250
     num_wires = max(operation.num_wires, 1)
-    if operation == qml.GlobalPhase and device_name == "lightning.tensor":
-        pytest.skip("GlobalPhase not implemented in lightning.tensor.")
-    if num_wires != 1 and device_name == "lightning.tensor":
-        pytest.skip("Multi-target wire controlled gates not implemented in lightning.tensor.")
 
     for n_wires in range(num_wires + 1, num_wires + 4):
         wire_lists = list(itertools.permutations(range(0, n_qubits), n_wires))
@@ -482,17 +473,21 @@ def circuit():
                     qml.ctrl(
                         operation(target_wires),
                         control_wires,
-                        control_values=[
-                            control_value or bool(i % 2) for i, _ in enumerate(control_wires)
-                        ],
+                        control_values=(
+                            [control_value or bool(i % 2) for i, _ in enumerate(control_wires)]
+                            if device_name != "lightning.tensor"
+                            else [control_value for _ in control_wires]
+                        ),
                     )
                 else:
                     qml.ctrl(
                         operation(*tuple([0.1234] * operation.num_params), target_wires),
                         control_wires,
-                        control_values=[
-                            control_value or bool(i % 2) for i, _ in enumerate(control_wires)
-                        ],
+                        control_values=(
+                            [control_value or bool(i % 2) for i, _ in enumerate(control_wires)]
+                            if device_name != "lightning.tensor"
+                            else [control_value for _ in control_wires]
+                        ),
                     )
                 return qml.state()
 
@@ -563,7 +558,7 @@ def test_paulirot(n_wires, n_targets, tol):
 
 
 @pytest.mark.skipif(
-    device_name != "lightning.qubit",
+    device_name not in ("lightning.qubit", "lightning.tensor"),
     reason="N-controlled operations only implemented in lightning.qubit.",
 )
 @pytest.mark.parametrize("control_wires", range(4))
@@ -593,13 +588,9 @@ def cnot_circuit():
 
     circ = qml.QNode(circuit, dev)
     circ_def = qml.QNode(cnot_circuit, dev)
-    assert np.allclose(circ(), circ_def(), tol)
+    assert np.allclose(circ(), circ_def(), atol=1e-4)
 
 
-@pytest.mark.skipif(
-    device_name == "lightning.tensor",
-    reason="lightning.tensor does not support controlled globalphase gate.",
-)
 @pytest.mark.parametrize("control_value", [False, True])
 @pytest.mark.parametrize("n_qubits", list(range(2, 8)))
 def test_controlled_globalphase(n_qubits, control_value, tol):
@@ -625,9 +616,11 @@ def circuit():
                 qml.ctrl(
                     operation(0.1234, target_wires),
                     control_wires,
-                    control_values=[
-                        control_value or bool(i % 2) for i, _ in enumerate(control_wires)
-                    ],
+                    control_values=(
+                        [control_value or bool(i % 2) for i, _ in enumerate(control_wires)]
+                        if device_name != "lightning.tensor"
+                        else [control_value for _ in control_wires]
+                    ),
                 )
                 return qml.state()
 
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index d892fd50b9..211a8c134b 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -151,7 +151,7 @@ def circuit():
             _ = circuit()
 
     @pytest.mark.skipif(
-        device_name == "lightning.gpu" or device_name == "lightning.tensor",
+        device_name in ("lightning.gpu", "lightning.tensor"),
         reason="lightning.gpu/lightning.tensor does not support out of order prob.",
     )
     @pytest.mark.parametrize(
diff --git a/tests/test_templates.py b/tests/test_templates.py
index 0c8ff9ca82..a0b6913b69 100644
--- a/tests/test_templates.py
+++ b/tests/test_templates.py
@@ -27,12 +27,17 @@
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
+def lightning_tensor_check(n_qubits):
+    """Mark the running test as xfail on lightning.tensor when ``n_qubits`` exceeds 14."""
+    if device_name == "lightning.tensor" and n_qubits > 14:
+        reason = "Inexact calculation for lightning.tensor with n_qubits > 14 since the default max mps bond dim is 2^7."
+        pytest.xfail(reason)
+
+
 class TestGrover:
     """Test Grover's algorithm (multi-controlled gates, decomposition, etc.)"""
 
-    @pytest.mark.parametrize(
-        "n_qubits", range(4, 8) if device_name != "lightning.tensor" else range(4, 6)
-    )
+    @pytest.mark.parametrize("n_qubits", range(4, 8))
     def test_grover(self, n_qubits):
         np.random.seed(42)
         omega = np.random.rand(n_qubits) > 0.5
@@ -64,10 +69,6 @@ def circuit(omega):
         assert np.allclose(np.sum(prob), 1.0)
         assert prob[index] > 0.95
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not have full support of multi-controlled gates.",
-    )
     @pytest.mark.skipif(not LightningDevice._new_API, reason="New API required.")
     @pytest.mark.parametrize("wires", [5, 10, 13, 15])
     def test_preprocess_grover_operator_decomposition(self, wires):
@@ -177,11 +178,9 @@ def circuit(feature_vector):
 class TestIQPEmbedding:
     """Test the IQPEmbedding algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor", reason="lightning.tensor does not support MultiRZ"
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_iqpembedding(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -200,11 +199,9 @@ def circuit(feature_vector):
 class TestQAOAEmbedding:
     """Test the QAOAEmbedding algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor", reason="lightning.tensor does not support MultiRZ"
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_qaoaembedding(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -245,6 +242,7 @@ class TestRandomLayers:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_randomlayers(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit", wires=n_qubits)
 
@@ -265,6 +263,7 @@ class TestStronglyEntanglingLayers:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_stronglyentanglinglayers(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -286,6 +285,7 @@ class TestSimplifiedTwoDesign:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_simplifiedtwodesign(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -309,6 +309,7 @@ class TestBasicEntanglerLayers:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_basicentanglerlayers(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -327,10 +328,6 @@ def circuit(weights):
 class TestMottonenStatePreparation:
     """Test the MottonenStatePreparation algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support GlobalPhase and 2+ wires gates.",
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 6, 2))
     def test_mottonenstatepreparation(self, n_qubits):
         dev = qml.device(device_name, wires=n_qubits)
@@ -352,10 +349,6 @@ def circuit(state):
 class TestArbitraryStatePreparation:
     """Test the ArbitraryStatePreparation algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support MultiRZ.",
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 6, 2))
     def test_arbitrarystatepreparation(self, n_qubits):
         dev = qml.device(device_name, wires=n_qubits)
@@ -376,10 +369,6 @@ def circuit(weights):
 class TestCosineWindow:
     """Test the CosineWindow algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support 2+ wires gates that can't be decomposed into 1,2 wires gates.",
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 6, 2))
     def test_cosinewindow(self, n_qubits):
         dev = qml.device(device_name, wires=n_qubits)
@@ -637,6 +626,7 @@ class TestApproxTimeEvolution:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_approxtimeevolution(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -659,6 +649,7 @@ class TestQDrift:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_qdrift(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit", wires=n_qubits)
 
@@ -681,6 +672,7 @@ class TestTrotterProduct:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_trotterproduct(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -703,6 +695,7 @@ class TestQuantumPhaseEstimation:
 
     @pytest.mark.parametrize("n_qubits", range(2, 12, 2))
     def test_quantumphaseestimation(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         phase = 5
         target_wires = [0]
         unitary = qml.RX(phase, wires=0).matrix()
@@ -735,6 +728,7 @@ class TestQFT:
 
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_qft(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
 
@@ -792,12 +786,9 @@ def circuit(basis_state):
 class TestQSVT:
     """Test the QSVT algorithm."""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support BlockEncode",
-    )
     @pytest.mark.parametrize("n_qubits", range(2, 20, 2))
     def test_qsvt(self, n_qubits):
+        lightning_tensor_check(n_qubits)
         dev = qml.device(device_name, wires=n_qubits)
         dq = qml.device("default.qubit")
         A = np.array([[0.1]])

From 5fba41b0f936378eb5c251aa28cbad3e1bf974dc Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Wed, 18 Sep 2024 19:52:15 +0000
Subject: [PATCH 3/5] Auto update version from '0.39.0-dev26' to '0.39.0-dev27'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index ebebbc9f97..bb3de8f43b 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev26"
+__version__ = "0.39.0-dev27"

From a5e4dd43bf88bb1fea23013af7c29bce3b96276d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Sat, 21 Sep 2024 15:17:43 -0400
Subject: [PATCH 4/5] Add support for multi-GPU state-vector (#914)

### Before submitting

Please complete the following checklist when submitting a PR:

- [X] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [X] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [ ] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [X] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**
 Migrate LightningGPU to the new device API

**Description of the Change:**
Adding support for multi-GPU just for the state vector class.

A follow-up PR will complete full multi-GPU support.

**Benefits:**
 Integration of LGPU with MPI in the new device API

**Possible Drawbacks:**

**Related GitHub Issues:**
## **Frozen PR** :warning: :snowflake:
To make a smooth integration of LightningGPU with the new device API, we
set the branch `gpuNewAPI_backend` as the base branch target for future
developments related to this big task.

The branch `gpuNewAPI_backend` has the mock of all classes and methods
necessary for the new API. Also, several tests were disabled with
``` python
if device_name == "lightning.gpu":
    pytest.skip("LGPU new API in WIP.  Skipping.",allow_module_level=True)
```
However, these tests will be unblocked as the implementation progresses.

After all the developments for integrating LightningGPU with the new API
have been completed, the PR will be opened for merging into `master`.

[sc-70960]

---------

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com>
---
 Makefile                                      |  27 +++-
 mpitests/conftest.py                          |   7 +
 mpitests/test_adjoint_jacobian.py             |   3 +
 mpitests/test_apply.py                        | 134 ++++--------------
 mpitests/test_device.py                       |  10 +-
 mpitests/test_expval.py                       |   3 +
 mpitests/test_measurements_sparse.py          |   3 +
 mpitests/test_probs.py                        |   3 +
 .../lightning_gpu/StateVectorCudaMPI.hpp      |  59 ++++++--
 .../lightning_gpu/bindings/LGPUBindings.hpp   |   8 --
 .../bindings/LGPUBindingsMPI.hpp              |   8 ++
 .../tests/mpi/Test_StateVectorCudaMPI.cpp     |  19 +++
 .../utils/cuStateVec_helpers.hpp              |  17 +++
 .../lightning_gpu/_measurements.py            |  13 +-
 .../lightning_gpu/_mpi_handler.py             |   8 +-
 .../lightning_gpu/_state_vector.py            |  15 +-
 .../lightning_gpu/lightning_gpu.py            |  10 +-
 .../lightning_kokkos/_state_vector.py         |   2 +
 .../lightning_qubit/_state_vector.py          |   2 +
 19 files changed, 197 insertions(+), 154 deletions(-)

diff --git a/Makefile b/Makefile
index c9c454bb64..4de5ca774f 100644
--- a/Makefile
+++ b/Makefile
@@ -35,9 +35,11 @@ help:
 	@echo "  test-cpp [verbose=1]     to run the C++ test suite (requires CMake)"
 	@echo "                           use with 'verbose=1' for building with verbose flag"
 	@echo "  test-cpp [target=?]      to run a specific C++ test target (requires CMake)."
+	@echo "  test-cpp-mpi [backend=?] to run the C++ test suite with MPI (requires CMake and MPI)"
+	@echo "                           Default: lightning_gpu"
 	@echo "  test-python [device=?]   to run the Python test suite"
 	@echo "                           Default: lightning.qubit"
-	@echo "  wheel [backend=?]        to configure and build Python wheels
+	@echo "  wheel [backend=?]        to configure and build Python wheels"
 	@echo "                           Default: lightning_qubit"
 	@echo "  coverage [device=?]      to generate a coverage report for python interface"
 	@echo "                           Default: lightning.qubit"
@@ -98,7 +100,7 @@ coverage-cpp:
 	lcov --directory . -b ../pennylane_lightning/core/src/ --capture --output-file coverage.info; \
 	genhtml coverage.info --output-directory out
 
-.PHONY: test-python test-builtin test-suite test-cpp
+.PHONY: test-python test-builtin test-suite test-cpp test-cpp-mpi
 test-python: test-builtin test-suite
 
 test-builtin:
@@ -124,6 +126,27 @@ else
 	cmake --build ./BuildTests $(VERBOSE) --target test
 endif
 
+test-cpp-mpi:
+	rm -rf ./BuildTests
+	cmake -BBuildTests -G Ninja \
+		  -DCMAKE_BUILD_TYPE=Debug \
+		  -DBUILD_TESTS=ON \
+		  -DENABLE_WARNINGS=ON \
+		  -DPL_BACKEND=lightning_gpu \
+		  -DENABLE_MPI=ON \
+		  $(OPTIONS)
+ifdef target
+	cmake --build ./BuildTests $(VERBOSE) --target $(target)
+	mpirun -np 2 ./BuildTests/$(target)
+else
+	cmake --build ./BuildTests $(VERBOSE)
+	for file in ./BuildTests/*_test_runner_mpi; do \
+		echo "Running $$file"; \
+		mpirun -np 2 $$file || exit 1; \
+	done
+endif
+
+
 .PHONY: format format-cpp format-python
 format: format-cpp format-python
 
diff --git a/mpitests/conftest.py b/mpitests/conftest.py
index a2084f2a5d..552cf9f330 100644
--- a/mpitests/conftest.py
+++ b/mpitests/conftest.py
@@ -98,6 +98,13 @@ def get_device():
 # Device specification
 if device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
+
 else:
     raise qml.DeviceError(f"The MPI tests do not apply to the {device_name} device.")
 
diff --git a/mpitests/test_adjoint_jacobian.py b/mpitests/test_adjoint_jacobian.py
index 6f3b5c7f5b..2100c8be69 100644
--- a/mpitests/test_adjoint_jacobian.py
+++ b/mpitests/test_adjoint_jacobian.py
@@ -31,6 +31,9 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
+if device_name == "lightning.gpu":
+    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+
 I, X, Y, Z = (
     np.eye(2),
     qml.PauliX.compute_matrix(),
diff --git a/mpitests/test_apply.py b/mpitests/test_apply.py
index 17d91cd2d7..3ae4463092 100644
--- a/mpitests/test_apply.py
+++ b/mpitests/test_apply.py
@@ -34,9 +34,12 @@
 )
 
 
-def create_random_init_state(numWires, R_DTYPE, seed_value=48):
+def create_random_init_state(numWires, C_DTYPE, seed_value=48):
     """Returns a random initial state of a certain type."""
     np.random.seed(seed_value)
+
+    R_DTYPE = np.float64 if C_DTYPE == np.complex128 else np.float32
+
     num_elements = 1 << numWires
     init_state = np.random.rand(num_elements).astype(R_DTYPE) + 1j * np.random.rand(
         num_elements
@@ -54,16 +57,13 @@ def apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
     expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
     local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
     local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -84,45 +84,6 @@ def circuit(*params):
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires):
-    """Wrapper applying a parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit(*params):
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(*params, wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit(*par)).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(*par, wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     """Wrapper applying a non-parametric gate with QNode function."""
     num_wires = numQubits
@@ -131,16 +92,13 @@ def apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires):
     num_global_wires = commSize.bit_length() - 1
     num_local_wires = num_wires - num_global_wires
 
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
+    c_dtype = dev_mpi.c_dtype
 
     expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
     local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
     local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
 
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+    state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
     comm.Bcast(state_vector, root=0)
 
     comm.Scatter(state_vector, local_state_vector, root=0)
@@ -161,45 +119,6 @@ def circuit():
     assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
-def apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires):
-    """Wrapper applying a non-parametric gate with the apply method."""
-    num_wires = numQubits
-    comm = MPI.COMM_WORLD
-    commSize = comm.Get_size()
-    num_global_wires = commSize.bit_length() - 1
-    num_local_wires = num_wires - num_global_wires
-
-    if dev_mpi.R_DTYPE == np.float32:
-        c_dtype = np.complex64
-    else:
-        c_dtype = np.complex128
-
-    expected_output_cpu = np.zeros(1 << num_wires).astype(c_dtype)
-    local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
-    local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
-
-    state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
-    comm.Bcast(state_vector, root=0)
-
-    comm.Scatter(state_vector, local_state_vector, root=0)
-    dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
-
-    @qml.qnode(dev_cpu)
-    def circuit():
-        qml.StatePrep(state_vector, wires=range(num_wires))
-        operation(wires=Wires)
-        return qml.state()
-
-    expected_output_cpu = np.array(circuit()).astype(c_dtype)
-    comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
-
-    dev_mpi.syncH2D(local_state_vector)
-    dev_mpi.apply([operation(wires=Wires)])
-    dev_mpi.syncD2H(local_state_vector)
-
-    assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
-
-
 class TestApply:  # pylint: disable=missing-function-docstring,too-many-arguments
     """Tests whether the device can apply supported quantum gates."""
 
@@ -220,13 +139,11 @@ def dev_mpi(self, request):
     @pytest.mark.parametrize("Wires", [0, 1, numQubits - 2, numQubits - 1])
     def test_apply_operation_single_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CNOT, qml.SWAP, qml.CY, qml.CZ])
     @pytest.mark.parametrize("Wires", [[0, 1], [numQubits - 2, numQubits - 1], [0, numQubits - 1]])
     def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -240,7 +157,6 @@ def test_apply_operation_two_wire_nonparam(self, tol, operation, Wires, dev_mpi)
     )
     def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CSWAP, qml.Toffoli])
     @pytest.mark.parametrize(
@@ -254,7 +170,6 @@ def test_apply_operation_three_wire_nonparam(self, tol, operation, Wires, dev_mp
     )
     def test_apply_operation_three_wire_qnode_nonparam(self, tol, operation, Wires, dev_mpi):
         apply_operation_gates_qnode_nonparam(tol, dev_mpi, operation, Wires)
-        apply_operation_gates_apply_nonparam(tol, dev_mpi, operation, Wires)
 
     @pytest.mark.parametrize("operation", [qml.PhaseShift, qml.RX, qml.RY, qml.RZ])
     @pytest.mark.parametrize("par", [[0.1], [0.2], [0.3]])
@@ -263,7 +178,6 @@ def test_apply_operation_1gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.Rot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -272,7 +186,6 @@ def test_apply_operation_1gatequbit_3param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize("operation", [qml.CRot])
     @pytest.mark.parametrize("par", [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]])
@@ -281,7 +194,6 @@ def test_apply_operation_1gatequbit_3param_cgate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -304,7 +216,6 @@ def test_apply_operation_2gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     @pytest.mark.parametrize(
         "operation",
@@ -323,7 +234,6 @@ def test_apply_operation_4gatequbit_1param_gate_qnode_param(
         self, tol, operation, par, Wires, dev_mpi
     ):
         apply_operation_gates_qnode_param(tol, dev_mpi, operation, par, Wires)
-        apply_operation_gates_apply_param(tol, dev_mpi, operation, par, Wires)
 
     # BasisState test
     @pytest.mark.parametrize("operation", [qml.BasisState])
@@ -337,7 +247,7 @@ def test_state_prep(self, tol, operation, index, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
@@ -347,7 +257,7 @@ def test_state_prep(self, tol, operation, index, dev_mpi):
         local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
         local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -399,7 +309,7 @@ def test_qubit_state_prep(self, tol, par, Wires, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
@@ -409,7 +319,7 @@ def test_qubit_state_prep(self, tol, par, Wires, dev_mpi):
         local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
         local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -435,7 +345,7 @@ def test_dev_reset(self, tol, dev_mpi):
         num_global_wires = commSize.bit_length() - 1
         num_local_wires = num_wires - num_global_wires
 
-        if dev_mpi.R_DTYPE == np.float32:
+        if dev_mpi.c_dtype == np.complex64:
             c_dtype = np.complex64
         else:
             c_dtype = np.complex128
@@ -445,7 +355,7 @@ def test_dev_reset(self, tol, dev_mpi):
         local_state_vector = np.zeros(1 << num_local_wires).astype(c_dtype)
         local_expected_output_cpu = np.zeros(1 << num_local_wires).astype(c_dtype)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
 
         comm.Scatter(state_vector, local_state_vector, root=0)
         dev_cpu = qml.device("lightning.qubit", wires=num_wires, c_dtype=c_dtype)
@@ -462,15 +372,19 @@ def circuit():
         expected_output_cpu = cpu_qnode().astype(c_dtype)
         comm.Scatter(expected_output_cpu, local_expected_output_cpu, root=0)
 
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state(False)
 
         gpumpi_qnode = qml.QNode(circuit, dev_mpi)
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state(False)
 
         local_state_vector = gpumpi_qnode()
         assert np.allclose(local_state_vector, local_expected_output_cpu, atol=tol, rtol=0)
 
 
+@pytest.mark.skipif(
+    device_name == "lightning.gpu",
+    reason="LGPU new API in WIP.  Skipping.",
+)
 class TestSparseHamExpval:  # pylint: disable=too-few-public-methods,missing-function-docstring
     """Tests sparse hamiltonian expectation values."""
 
@@ -518,6 +432,10 @@ def test_sparse_hamiltonian_expectation(self, C_DTYPE):
         assert np.allclose(res, expected)
 
 
+@pytest.mark.skipif(
+    device_name == "lightning.gpu",
+    reason="LGPU new API in WIP.  Skipping.",
+)
 class TestExpval:
     """Tests that expectation values are properly calculated or that the proper errors are raised."""
 
@@ -543,7 +461,7 @@ def test_expval_single_wire_no_parameters(self, tol, operation, wires, C_DTYPE):
 
         dev_mpi = qml.device("lightning.gpu", wires=numQubits, mpi=True, c_dtype=C_DTYPE)
 
-        state_vector = create_random_init_state(num_wires, dev_mpi.R_DTYPE)
+        state_vector = create_random_init_state(num_wires, dev_mpi.c_dtype)
         comm.Bcast(state_vector, root=0)
 
         local_state_vector = np.zeros(1 << num_local_wires).astype(C_DTYPE)
@@ -726,7 +644,7 @@ def test_sample_values_qnode(self, tol, C_DTYPE):
         dev_mpi = qml.device(
             "lightning.gpu", wires=num_wires, mpi=True, shots=1000, c_dtype=C_DTYPE
         )
-        dev_mpi.reset()
+        dev_mpi._statevector.reset_state(False)
 
         @qml.qnode(dev_mpi)
         def circuit():
diff --git a/mpitests/test_device.py b/mpitests/test_device.py
index 03a1880114..dd783dbee7 100644
--- a/mpitests/test_device.py
+++ b/mpitests/test_device.py
@@ -38,13 +38,13 @@ def test_create_device():
 
 
 def test_unsupported_mpi_buf_size():
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=-1)
-    with pytest.raises(TypeError, match="Unsupported mpi_buf_size value"):
+    with pytest.raises(ValueError, match="Unsupported mpi_buf_size value"):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=3)
-    with pytest.warns(
-        RuntimeWarning,
-        match="The MPI buffer size is larger than the local state vector size",
+    with pytest.raises(
+        RuntimeError,
+        match="The MPI buffer size is larger than the local state vector size.",
     ):
         dev = qml.device(device_name, mpi=True, wires=4, mpi_buf_size=2**4)
     with pytest.raises(
diff --git a/mpitests/test_expval.py b/mpitests/test_expval.py
index d020471c03..68f394d49a 100644
--- a/mpitests/test_expval.py
+++ b/mpitests/test_expval.py
@@ -22,6 +22,9 @@
 from conftest import PHI, THETA, VARPHI, device_name
 from mpi4py import MPI
 
+if device_name == "lightning.gpu":
+    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+
 
 @pytest.mark.parametrize("theta, phi", list(zip(THETA, PHI)))
 class TestExpval:
diff --git a/mpitests/test_measurements_sparse.py b/mpitests/test_measurements_sparse.py
index 7ca88867eb..ee45e0009c 100644
--- a/mpitests/test_measurements_sparse.py
+++ b/mpitests/test_measurements_sparse.py
@@ -27,6 +27,9 @@
 if not ld._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
+if device_name == "lightning.gpu":
+    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+
 
 class TestSparseExpval:
     """Tests for the expval function"""
diff --git a/mpitests/test_probs.py b/mpitests/test_probs.py
index b2f57f733a..eea9898e12 100644
--- a/mpitests/test_probs.py
+++ b/mpitests/test_probs.py
@@ -20,6 +20,9 @@
 # pylint: disable=missing-function-docstring,unnecessary-comprehension,too-many-arguments,wrong-import-order,unused-variable,c-extension-no-member
 from mpi4py import MPI
 
+if device_name == "lightning.gpu":
+    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
+
 numQubits = 8
 
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
index 3753f792fd..d7b6e7ef36 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaMPI.hpp
@@ -260,13 +260,12 @@ class StateVectorCudaMPI final
      */
     void setBasisState(const std::complex<Precision> &value,
                        const std::size_t index, const bool async = false) {
-        std::size_t rankId = index >> BaseType::getNumQubits();
 
-        std::size_t local_index =
-            static_cast<std::size_t>(
-                rankId * std::pow(2.0, static_cast<long double>(
-                                           BaseType::getNumQubits()))) ^
-            index;
+        const std::size_t rankId = index >> this->getNumLocalQubits();
+
+        const std::size_t local_index =
+            compute_local_index(index, this->getNumLocalQubits());
+
         BaseType::getDataBuffer().zeroInit();
 
         CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
@@ -280,6 +279,46 @@ class StateVectorCudaMPI final
         mpi_manager_.Barrier();
     }
 
+    /**
+     * @brief Prepare a single computational basis state.
+     *
+     * @param state Binary number representing the index
+     * @param wires Wires.
+     * @param use_async Use an asynchronous memory copy.
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async) {
+        // This is not functional yet.
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+
+        const auto n_wires = this->getTotalNumQubits();
+
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < state.size(); k++) {
+            const auto bit = state[k];
+            index |= bit << (n_wires - 1 - wires[k]);
+        }
+
+        const std::size_t rankId = index >> this->getNumLocalQubits();
+        const std::size_t local_index =
+            compute_local_index(index, this->getNumLocalQubits());
+
+        const std::complex<PrecisionT> value(1, 0);
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+
+        BaseType::getDataBuffer().zeroInit();
+
+        auto stream_id = localStream_.get();
+
+        if (mpi_manager_.getRank() == rankId) {
+            setBasisState_CUDA(BaseType::getData(), value_cu, local_index,
+                               use_async, stream_id);
+        }
+        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
+        mpi_manager_.Barrier();
+    }
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
@@ -307,11 +346,9 @@ class StateVectorCudaMPI final
                 static_cast<std::size_t>(index) >> BaseType::getNumQubits();
 
             if (rankId == mpi_manager_.getRank()) {
-                int local_index =
-                    static_cast<std::size_t>(
-                        rankId * std::pow(2.0, static_cast<long double>(
-                                                   BaseType::getNumQubits()))) ^
-                    index;
+                int local_index = static_cast<int>(
+                    compute_local_index(static_cast<std::size_t>(index),
+                                        this->getNumLocalQubits()));
                 indices_local.push_back(local_index);
                 values_local.push_back(values[i]);
             }
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 3c44179702..2e4e65c739 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -81,14 +81,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
             return new StateVectorT(data_ptr,
                                     static_cast<std::size_t>(arr.size()));
         }))
-        .def(
-            "setBasisStateZero",
-            [](StateVectorT &sv, const bool use_async) {
-                const std::complex<PrecisionT> value(1, 0);
-                std::size_t zero{0U};
-                sv.setBasisState(value, zero, use_async);
-            },
-            "Create Basis State to zero on GPU.")
         .def(
             "setBasisState",
             [](StateVectorT &sv, const std::vector<std::size_t> &state,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index e9f8b762d3..3b79402a64 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -86,6 +86,14 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
             })) // qubits, device
         .def(
             "setBasisState",
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires, const bool use_async) {
+                sv.setBasisState(state, wires, use_async);
+            },
+            "Set the state vector to a basis state on GPU.")
+
+        .def(
+            "setBasisStateIndex",
             [](StateVectorT &sv, const std::size_t index,
                const bool use_async) {
                 const std::complex<PrecisionT> value(1, 0);
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
index 6dd5a01590..e8353d8280 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/tests/mpi/Test_StateVectorCudaMPI.cpp
@@ -36,6 +36,7 @@
 namespace {
 using namespace Pennylane::LightningGPU;
 using namespace Pennylane::LightningGPU::MPI;
+using namespace Pennylane::LightningGPU::Util;
 using namespace Pennylane::Util;
 
 using Pennylane::Util::isApproxEqual;
@@ -52,6 +53,24 @@ TEMPLATE_TEST_CASE("StateVectorCudaMPI::Constructibility",
     }
 }
 
+TEMPLATE_TEST_CASE("cuStateVec_helper::compute_local_index",
+                   "[Default Constructibility]", StateVectorCudaMPI<>) {
+
+    const std::size_t local_num_qubits = 4;
+
+    SECTION("compute_local_index, index inside the current qubits set") {
+        const std::size_t index = 2; // 0b00010
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == index);
+    }
+
+    SECTION("compute_local_index, index outside the current qubits set") {
+        const std::size_t index = 16; // 0b10000
+        std::size_t local_index = compute_local_index(index, local_num_qubits);
+        REQUIRE(local_index == 0);
+    }
+}
+
 TEMPLATE_PRODUCT_TEST_CASE("StateVectorCudaMPI::Constructibility",
                            "[General Constructibility]", (StateVectorCudaMPI),
                            (float, double)) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
index 8bd27c2dc8..bc686e887a 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/utils/cuStateVec_helpers.hpp
@@ -101,4 +101,21 @@ inline SharedCusvHandle make_shared_cusv_handle() {
     PL_CUSTATEVEC_IS_SUCCESS(custatevecCreate(&h));
     return {h, handleDeleter()};
 }
+
+/**
+ * @brief Compute the local index from a given index in multi-gpu workflow
+ *
+ * @param index Global index of the target element.
+ * @param num_qubits Number of wires within the local devices.
+ * @return The lowest `num_qubits` bits of `index`.
+ */
+inline std::size_t compute_local_index(const std::size_t index,
+                                       const std::size_t num_qubits) {
+    // TODO: bound check for the left shift operation here
+    // Clearing every bit at or above position `num_qubits` is the same as
+    // XOR-ing `index` with its own upper part shifted back into place.
+    constexpr std::size_t one{1U};
+    return index & ((one << num_qubits) - one);
+}
+
 } // namespace Pennylane::LightningGPU::Util
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 44bfcca60b..dec1417338 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -43,6 +43,8 @@
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
 
+from ._mpi_handler import MPIHandler
+
 
 class LightningGPUMeasurements(LightningBaseMeasurements):
     """Lightning GPU Measurements class
@@ -51,15 +53,21 @@ class LightningGPUMeasurements(LightningBaseMeasurements):
 
     Args:
         qubit_state(LightningGPUStateVector): Lightning state-vector class containing the state vector to be measured.
+        use_mpi(bool): Whether to use the MPI measurement bindings, which
+            provide functionality to run on multiple devices. Defaults to ``False``.
+
     """
 
     def __init__(
         self,
         lgpu_state,
+        use_mpi=False,
     ) -> TensorLike:
 
         super().__init__(lgpu_state)
 
+        self._use_mpi = use_mpi
+
         self._measurement_lightning = self._measurement_dtype()(lgpu_state.state_vector)
 
     def _measurement_dtype(self):
@@ -67,7 +75,10 @@ def _measurement_dtype(self):
 
         Returns: the Measurements class
         """
-        return MeasurementsC64 if self.dtype == np.complex64 else MeasurementsC128
+        if self._use_mpi:
+            return MeasurementsMPIC128 if self.dtype == np.complex128 else MeasurementsMPIC64
+        else:
+            return MeasurementsC128 if self.dtype == np.complex128 else MeasurementsC64
 
     def _measure_with_samples_diagonalizing_gates(
         self,
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
index ca09b8c033..2b83ae55f1 100644
--- a/pennylane_lightning/lightning_gpu/_mpi_handler.py
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -57,7 +57,7 @@ def __init__(
     ) -> None:
 
         self.use_mpi = mpi
-        self.mpi_but_size = mpi_buf_size
+        self.mpi_buf_size = mpi_buf_size
         self._dp = dev_pool
 
         if self.use_mpi:
@@ -66,7 +66,7 @@ def __init__(
                 raise ImportError("MPI related APIs are not found.")
 
             if mpi_buf_size < 0:
-                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+                raise ValueError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
 
             if mpi_buf_size > 0 and (mpi_buf_size & (mpi_buf_size - 1)):
                 raise ValueError(
@@ -77,7 +77,7 @@ def __init__(
             self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
 
             # set the number of global and local wires
-            commSize = self._mpi_manager.getSize()
+            commSize = self.mpi_manager.getSize()
             self.num_global_wires = commSize.bit_length() - 1
             self.num_local_wires = num_wires - self.num_global_wires
 
@@ -94,7 +94,7 @@ def _check_memory_size(self, c_dtype, mpi_buf_size):
         # Memory size in bytes
         sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
         if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
-            raise ValueError("The MPI buffer size is larger than the local state vector size.")
+            raise RuntimeError("The MPI buffer size is larger than the local state vector size.")
 
     def _mpi_init_helper(self, num_wires):
         """Set up MPI checks and initializations."""
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index 530292614e..538adff374 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -36,7 +36,7 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane import DeviceError
+from pennylane import DeviceError, math
 from pennylane.measurements import MidMeasureMP
 from pennylane.ops import Conditional
 from pennylane.ops.op_math import Adjoint
@@ -108,7 +108,7 @@ def __init__(
             self._qubit_state = self._state_dtype()(self.num_wires)
 
         use_async = False
-        self._qubit_state.setBasisStateZero(use_async)
+        self.reset_state(use_async)
 
     def _state_dtype(self):
         """Binding to Lightning Managed state vector C++ class.
@@ -189,15 +189,6 @@ def _asarray(arr, dtype=None):
 
         return arr
 
-    def _create_basis_state(self, index, use_async=False):
-        """Creates a computational basis state consisting of 0s and 1s, over all wires on device.
-        Args:
-            index (int): integer representing the computational basis state.
-            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
-            Note: This function only supports synchronized memory copy.
-        """
-        self._qubit_state.setBasisState(index, use_async)
-
     def _apply_state_vector(self, state, device_wires, use_async=False):
         """Initialize the state vector on GPU with a specified state on host.
         Note that any use of this method will introduce host-overheads.
@@ -228,7 +219,7 @@ def _apply_state_vector(self, state, device_wires, use_async=False):
             if self.num_wires == self._num_local_wires:
                 self.syncH2D(np.reshape(state, output_shape))
                 return
-            local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
+            local_state = np.zeros(1 << self._num_local_wires, dtype=self._dtype)
             self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
             self.syncH2D(np.reshape(local_state, output_shape))
             return
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 87be05cd92..00159c1c21 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -461,7 +461,7 @@ def execute(
         results = []
         for circuit in circuits:
             if self._wire_map is not None:
-                circuit, _ = qml.map_wires(circuit, self._wire_map)
+                [circuit], _ = qml.map_wires(circuit, self._wire_map)
             results.append(
                 self.simulate(
                     circuit,
@@ -489,10 +489,12 @@ def supports_derivatives(
             Bool: Whether or not a derivative can be calculated provided the given information
 
         """
-        if circuit is None or (execution_config is None and circuit is None):
+        if execution_config is None and circuit is None:
             return True
         if execution_config.gradient_method not in {"adjoint", "best"}:
             return False
+        if circuit is None:
+            return True
         return _supports_adjoint(circuit=circuit)
 
     def simulate(
@@ -520,4 +522,6 @@ def simulate(
 
         state.reset_state(sync=False)
         final_state = state.get_final_state(circuit)
-        return LightningGPUMeasurements(final_state).measure_final_state(circuit)
+        return LightningGPUMeasurements(final_state, self._mpi_handler.use_mpi).measure_final_state(
+            circuit
+        )
diff --git a/pennylane_lightning/lightning_kokkos/_state_vector.py b/pennylane_lightning/lightning_kokkos/_state_vector.py
index b629a17dbe..8309054850 100644
--- a/pennylane_lightning/lightning_kokkos/_state_vector.py
+++ b/pennylane_lightning/lightning_kokkos/_state_vector.py
@@ -26,6 +26,8 @@
 except ImportError:
     pass
 
+from typing import Optional
+
 import numpy as np
 import pennylane as qml
 from pennylane.measurements import MidMeasureMP
diff --git a/pennylane_lightning/lightning_qubit/_state_vector.py b/pennylane_lightning/lightning_qubit/_state_vector.py
index 0a8b0b7d6d..570ae58af3 100644
--- a/pennylane_lightning/lightning_qubit/_state_vector.py
+++ b/pennylane_lightning/lightning_qubit/_state_vector.py
@@ -24,6 +24,8 @@
 except ImportError:
     pass
 
+from typing import Optional
+
 import numpy as np
 import pennylane as qml
 from pennylane.measurements import MidMeasureMP

From b658ef7a83c8b5dd8a064880a15a6b250887a03d Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Sat, 21 Sep 2024 19:17:59 +0000
Subject: [PATCH 5/5] Auto update version from '0.39.0-dev27' to '0.39.0-dev32'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index bb3de8f43b..017bfccf85 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.39.0-dev27"
+__version__ = "0.39.0-dev32"