From da755346ccb5e628e2ed9e7affe61443abee7e42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luis=20Alfredo=20Nu=C3=B1ez=20Meneses?=
 <alfredo.nunez@xanadu.ai>
Date: Mon, 16 Sep 2024 18:52:54 -0400
Subject: [PATCH] Add the `state_vector`, `measurement` class and `simulate`
 method for the LightningGPU with the new device API (#892)

### Before submitting

Please complete the following checklist when submitting a PR:

- [X] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [X] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [X] Ensure that the test suite passes, by running `make test`.

- [ ] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [X] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**
Migrate LightningGPU to the new device API

**Description of the Change:**
Create the `state_vector` and `measurement` classes for the new device
API, which are required to implement the `simulate` method.

**Benefits:**
Integration of LGPU with the new device API

**Possible Drawbacks:**

**Related GitHub Issues:**
## **Frozen PR** :warning: :snowflake:
To make a smooth integration of LightningGPU with the new device API, we
set the branch `gpuNewAPI_backend` as the base branch target for future
developments related to this big task.

The branch `gpuNewAPI_backend` has the mock of all classes and methods
necessary for the new API. Also, several tests were disabled with
``` python
if device_name == "lightning.gpu":
    pytest.skip("LGPU new API in WIP.  Skipping.",allow_module_level=True)
```
However, these tests will be unblocked as the implementation progresses.

Once all the developments for integrating LightningGPU with the new API
have been completed, the PR will be opened for merging into `master`.

[sc-70932]

---------

Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
---
 .../core/_measurements_base.py                |  19 +-
 .../core/_state_vector_base.py                |  30 +-
 .../lightning_gpu/StateVectorCudaManaged.hpp  |  36 ++
 .../lightning_gpu/bindings/LGPUBindings.hpp   |  20 +-
 .../bindings/LGPUBindingsMPI.hpp              |   2 +-
 .../measurements/MeasurementsGPU.hpp          |   2 +-
 .../lightning_gpu/_measurements.py            | 106 ++++++
 .../lightning_gpu/_mpi_handler.py             | 126 +++++++
 .../lightning_gpu/_state_vector.py            | 315 +++++++++++++++++-
 .../lightning_gpu/lightning_gpu.py            |  86 +++--
 .../lightning_kokkos/_measurements.py         |   4 +-
 .../lightning_qubit/_measurements.py          |   4 +-
 tests/conftest.py                             |   8 +-
 .../test_measurements_class.py                |   7 +-
 tests/lightning_qubit/test_simulate_method.py |   3 -
 .../test_state_vector_class.py                |  24 +-
 16 files changed, 717 insertions(+), 75 deletions(-)
 create mode 100644 pennylane_lightning/lightning_gpu/_mpi_handler.py

diff --git a/pennylane_lightning/core/_measurements_base.py b/pennylane_lightning/core/_measurements_base.py
index 06ae878899..dbfb46e20a 100644
--- a/pennylane_lightning/core/_measurements_base.py
+++ b/pennylane_lightning/core/_measurements_base.py
@@ -130,24 +130,37 @@ def expval(self, measurementprocess: MeasurementProcess):
             measurementprocess.obs.name, measurementprocess.obs.wires
         )
 
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Convert the probabilities returned by the C++ backend to the structure shared by lightning devices.
+        Args:
+            probs_results (Any): Result provided by C++ backend; this base implementation returns it unchanged.
+        Returns:
+            np.ndarray with probabilities of the supplied observable or wires.
+        """
+        return probs_results
+
     def probs(self, measurementprocess: MeasurementProcess):
         """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
 
         Args:
-            measurementprocess (StateMeasurement): measurement to apply to the state
+            measurementprocess (StateMeasurement): measurement to apply to the state.
 
         Returns:
-            Probabilities of the supplied observable or wires
+            Probabilities of the supplied observable or wires.
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(diagonalizing_gates)
+
         results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+
         if diagonalizing_gates:
             self._qubit_state.apply_operations(
                 [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
             )
-        return results
+
+        return self._probs_retval_conversion(results)
 
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess.
diff --git a/pennylane_lightning/core/_state_vector_base.py b/pennylane_lightning/core/_state_vector_base.py
index b2ba3a0669..bef158bc43 100644
--- a/pennylane_lightning/core/_state_vector_base.py
+++ b/pennylane_lightning/core/_state_vector_base.py
@@ -16,7 +16,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Union
+from typing import Optional, Union
 
 import numpy as np
 from pennylane import BasisState, StatePrep
@@ -35,9 +35,12 @@ class LightningBaseStateVector(ABC):
         num_wires(int): the number of wires to initialize the device with
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
+        sync Optional(bool): immediately sync with host-sv after applying operation.
     """
 
-    def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64]):
+    def __init__(
+        self, num_wires: int, dtype: Union[np.complex128, np.complex64], sync: Optional[bool] = None
+    ):
 
         if dtype not in [np.complex64, np.complex128]:
             raise TypeError(f"Unsupported complex type: {dtype}")
@@ -45,6 +48,7 @@ def __init__(self, num_wires: int, dtype: Union[np.complex128, np.complex64]):
         self._num_wires = num_wires
         self._wires = Wires(range(num_wires))
         self._dtype = dtype
+        self._base_sync = sync
 
         # Dummy for the device name
         self._device_name = None
@@ -96,13 +100,16 @@ def _state_dtype(self):
         Returns: the state vector class
         """
 
-    def reset_state(self):
+    def reset_state(self, sync: Optional[bool] = None):
         """Reset the device's state"""
         # init the state vector to |00..0>
-        self._qubit_state.resetStateVector()
+        if sync is None:  # forward `sync` to the binding only when explicitly given
+            self._qubit_state.resetStateVector()
+        else:
+            self._qubit_state.resetStateVector(sync)
 
     @abstractmethod
-    def _apply_state_vector(self, state, device_wires: Wires):
+    def _apply_state_vector(self, state, device_wires: Wires, sync: Optional[bool] = None):
         """Initialize the internal state vector in a specified state.
         Args:
             state (array[complex]): normalized input state of length ``2**len(wires)``
@@ -110,7 +117,7 @@ def _apply_state_vector(self, state, device_wires: Wires):
             device_wires (Wires): wires that get initialized in the state
         """
 
-    def _apply_basis_state(self, state, wires):
+    def _apply_basis_state(self, state, wires, use_async: Optional[bool] = None):
         """Initialize the state vector in a specified computational basis state.
 
         Args:
@@ -118,6 +125,7 @@ def _apply_basis_state(self, state, wires):
                 consisting of 0s and 1s.
             wires (Wires): wires that the provided computational state should be
                 initialized on
+            use_async(Optional[bool]): immediately sync with host-sv after applying operation.
 
         Note: This function does not support broadcasted inputs yet.
         """
@@ -128,7 +136,11 @@ def _apply_basis_state(self, state, wires):
             raise ValueError("BasisState parameter and wires must be of equal length.")
 
         # Return a computational basis state over all wires.
-        self._qubit_state.setBasisState(list(state), list(wires))
+        # Forward `use_async` to the backend binding only when it was explicitly provided.
+        if use_async is None:
+            self._qubit_state.setBasisState(list(state), list(wires))
+        else:
+            self._qubit_state.setBasisState(list(state), list(wires), use_async)
 
     @abstractmethod
     def _apply_lightning_controlled(self, operation):
@@ -185,7 +197,9 @@ def apply_operations(
                 self._apply_state_vector(operations[0].parameters[0].copy(), operations[0].wires)
                 operations = operations[1:]
             elif isinstance(operations[0], BasisState):
-                self._apply_basis_state(operations[0].parameters[0], operations[0].wires)
+                self._apply_basis_state(
+                    operations[0].parameters[0], operations[0].wires, self._base_sync
+                )
                 operations = operations[1:]
         self._apply_lightning(
             operations, mid_measurements=mid_measurements, postselect_mode=postselect_mode
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
index 716d95c89f..174d23aea2 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -184,6 +184,42 @@ class StateVectorCudaManaged
                            stream_id);
     }
 
+    /**
+     * @brief Prepare a single computational basis state.
+     *
+     * @param state Bit assigned to each wire in `wires` (expected 0 or 1).
+     * @param wires Target wires; wire 0 maps to the most significant bit of
+     * the basis-state index. Each wire must be below the number of qubits.
+     * @param use_async Flag forwarded unchanged to `setBasisState_CUDA`.
+     * NOTE(review): presumably selects an asynchronous launch; confirm.
+     */
+    void setBasisState(const std::vector<std::size_t> &state,
+                       const std::vector<std::size_t> &wires,
+                       const bool use_async) {
+        PL_ABORT_IF_NOT(state.size() == wires.size(),
+                        "state and wires must have equal dimensions.");
+        const auto num_qubits = BaseType::getNumQubits();
+        PL_ABORT_IF_NOT(
+            std::find_if(wires.begin(), wires.end(),
+                         [&num_qubits](const auto i) {
+                             return i >= num_qubits;
+                         }) == wires.end(),
+            "wires must take values lower than the number of qubits.");
+        const auto n_wires = wires.size();
+        std::size_t index{0U};
+        for (std::size_t k = 0; k < n_wires; k++) {
+            const auto bit = static_cast<std::size_t>(state[k]);
+            index |= bit << (num_qubits - 1 - wires[k]);
+        }
+
+        BaseType::getDataBuffer().zeroInit();
+        const std::complex<PrecisionT> value(1, 0);
+        CFP_t value_cu = cuUtil::complexToCu<std::complex<Precision>>(value);
+        auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();
+        setBasisState_CUDA(BaseType::getData(), value_cu, index, use_async,
+                           stream_id);
+    }
+
     /**
      * @brief Set values for a batch of elements of the state-vector. This
      * method is implemented by the customized CUDA kernel defined in the
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
index 5bd92b5520..3c44179702 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindings.hpp
@@ -82,13 +82,21 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
                                     static_cast<std::size_t>(arr.size()));
         }))
         .def(
-            "setBasisState",
-            [](StateVectorT &sv, const std::size_t index,
-               const bool use_async) {
+            "setBasisStateZero",
+            [](StateVectorT &sv, const bool use_async) {
                 const std::complex<PrecisionT> value(1, 0);
-                sv.setBasisState(value, index, use_async);
+                std::size_t zero{0U};
+                sv.setBasisState(value, zero, use_async);
             },
-            "Create Basis State on GPU.")
+            "Create Basis State to zero on GPU.")
+        .def(
+            "setBasisState",
+            [](StateVectorT &sv, const std::vector<std::size_t> &state,
+               const std::vector<std::size_t> &wires, const bool use_async) {
+                sv.setBasisState(state, wires, use_async);
+            },
+            "Set the state vector to a basis state on GPU.")
+
         .def(
             "setStateVector",
             [](StateVectorT &sv, const np_arr_sparse_ind &indices,
@@ -152,7 +160,7 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def("resetStateVector", &StateVectorT::initSV)
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
index 620fd93868..e9f8b762d3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/bindings/LGPUBindingsMPI.hpp
@@ -155,7 +155,7 @@ void registerBackendClassSpecificBindingsMPI(PyClass &pyclass) {
              "Get the GPU index for the statevector data.")
         .def("numQubits", &StateVectorT::getNumQubits)
         .def("dataLength", &StateVectorT::getLength)
-        .def("resetGPU", &StateVectorT::initSV)
+        .def("resetStateVector", &StateVectorT::initSV)
         .def(
             "apply",
             [](StateVectorT &sv, const std::string &str,
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
index 460a4fa8cb..fe19b5d025 100644
--- a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -273,7 +273,7 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
-            CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+            CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
         PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
             this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
diff --git a/pennylane_lightning/lightning_gpu/_measurements.py b/pennylane_lightning/lightning_gpu/_measurements.py
index 3f4890c55d..44bfcca60b 100644
--- a/pennylane_lightning/lightning_gpu/_measurements.py
+++ b/pennylane_lightning/lightning_gpu/_measurements.py
@@ -15,8 +15,30 @@
 Class implementation for state vector measurements.
 """
 
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import MeasurementsC64, MeasurementsC128
+
+    try:
+        from pennylane_lightning.lightning_gpu_ops import MeasurementsMPIC64, MeasurementsMPIC128
+
+        MPI_SUPPORT = True
+    except ImportError as ex:
+        warn(str(ex), UserWarning)
+
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+    # Without the base bindings there is no MPI support either; keep the flag defined.
+    MPI_SUPPORT = False
+
+from typing import Any, List
+
 import numpy as np
 import pennylane as qml
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
@@ -37,3 +59,87 @@ def __init__(
     ) -> TensorLike:
 
         super().__init__(lgpu_state)
+
+        self._measurement_lightning = self._measurement_dtype()(lgpu_state.state_vector)
+
+    def _measurement_dtype(self):
+        """Pick the Lightning GPU Measurements C++ binding matching the state-vector precision.
+
+        Returns: ``MeasurementsC64`` for ``np.complex64``, ``MeasurementsC128`` otherwise
+        """
+        return MeasurementsC128 if self.dtype != np.complex64 else MeasurementsC64
+
+    def _measure_with_samples_diagonalizing_gates(
+        self,
+        mps: List[SampleMeasurement],
+        shots: Shots,
+    ) -> TensorLike:
+        """
+        Returns the samples of the measurement process performed on the given state,
+        by rotating the state into the measurement basis using the diagonalizing gates
+        given by the measurement process.
+
+        Args:
+            mps (List[~.measurements.SampleMeasurement]): The sample measurements to perform
+            shots (~.measurements.Shots): The shots (possibly a shot vector) defining how many samples to take
+
+        Returns:
+            TensorLike[Any]: Sample measurement results; one tuple per shot bin when the shots are partitioned
+        """
+        # apply diagonalizing gates (they are undone again after sampling)
+        self._apply_diagonalizing_gates(mps)
+
+        # Specific for LGPU: samples are generated for, and processed against, every wire of the device
+        total_indices = self._qubit_state.num_wires
+        wires = qml.wires.Wires(range(total_indices))
+        # Post-process one batch of samples for every measurement; only CountsMP results are left unsqueezed
+        def _process_single_shot(samples):
+            processed = []
+            for mp in mps:
+                res = mp.process_samples(samples, wires)
+                if not isinstance(mp, CountsMP):
+                    res = qml.math.squeeze(res)
+
+                processed.append(res)
+
+            return tuple(processed)
+
+        try:
+            samples = self._measurement_lightning.generate_samples(
+                len(wires), shots.total_shots
+            ).astype(int, copy=False)
+        # A "probabilities contain NaN" error is mapped to all-zero samples; any other ValueError propagates
+        except ValueError as ex:
+            if str(ex) != "probabilities contain NaN":
+                raise ex
+            samples = qml.math.full((shots.total_shots, len(wires)), 0)
+
+        self._apply_diagonalizing_gates(mps, adjoint=True)
+
+        # if there is a shot vector, use the shots.bins generator to
+        # split samples w.r.t. the shots
+        processed_samples = []
+        for lower, upper in shots.bins():
+            result = _process_single_shot(samples[..., lower:upper, :])
+            processed_samples.append(result)
+
+        return (
+            tuple(zip(*processed_samples)) if shots.has_partitioned_shots else processed_samples[0]
+        )
+
+    def _probs_retval_conversion(self, probs_results: Any) -> np.ndarray:
+        """Reorder the probabilities returned by the C++ backend into the layout shared by lightning devices.
+
+        Args:
+            probs_results (Any): Result provided by C++ backend.
+
+        Returns:
+            np.ndarray with probabilities of the supplied observable or wires.
+        """
+        n_entries = len(probs_results)
+        if n_entries == 0:
+            return probs_results
+
+        # Device returns as col-major orderings, so perform transpose on data for bit-index shuffle for now.
+        n_local_wires = n_entries.bit_length() - 1
+        return probs_results.reshape([2] * n_local_wires).transpose().reshape(-1)
diff --git a/pennylane_lightning/lightning_gpu/_mpi_handler.py b/pennylane_lightning/lightning_gpu/_mpi_handler.py
new file mode 100644
index 0000000000..ca09b8c033
--- /dev/null
+++ b/pennylane_lightning/lightning_gpu/_mpi_handler.py
@@ -0,0 +1,126 @@
+# Copyright 2022-2024 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module contains the :class:`~.LightningGPU_MPIHandler` class, a MPI handler to use LightningGPU device with multi-GPU on multi-node system.
+"""
+
+from warnings import warn
+
+try:
+    # pylint: disable=no-name-in-module
+    from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
+
+    MPI_SUPPORT = True
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+    MPI_SUPPORT = False
+
+from typing import Callable, Union
+
+import numpy as np
+
+
+# MPI options
+class MPIHandler:
+    """MPI handler for PennyLane Lightning GPU device.
+
+    MPI handler to use a GPU-backed Lightning device using NVIDIA cuQuantum SDK with parallel capabilities.
+
+    When MPI is enabled, the handler initializes the MPI manager, binds the process to a GPU, splits the wires into global and local ones, and checks the MPI buffer against the local state-vector size.
+
+    Args:
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation; must be 0 or a positive power of 2.
+        dev_pool (Callable): object used to query (``getTotalDevices``) and select (``setDeviceID``) the GPU devices available.
+        num_wires (int): the number of wires to initialize the device with.
+        c_dtype (np.complex64, np.complex128): Datatypes for statevector representation.
+    """
+
+    def __init__(
+        self,
+        mpi: bool,
+        mpi_buf_size: int,
+        dev_pool: Callable,
+        num_wires: int,
+        c_dtype: Union[np.complex64, np.complex128],
+    ) -> None:
+
+        self.use_mpi = mpi
+        self.mpi_buf_size = mpi_buf_size
+        self._dp = dev_pool
+
+        if self.use_mpi:
+
+            if not MPI_SUPPORT:
+                raise ImportError("MPI related APIs are not found.")
+
+            if mpi_buf_size < 0:
+                raise TypeError(f"Unsupported mpi_buf_size value: {mpi_buf_size}, should be >= 0")
+
+            if mpi_buf_size > 0 and (mpi_buf_size & (mpi_buf_size - 1)):
+                raise ValueError(
+                    f"Unsupported mpi_buf_size value: {mpi_buf_size}. mpi_buf_size should be power of 2."
+                )
+
+            # All MPI parameters are valid: create the MPI manager and the device tag
+            self.mpi_manager, self.devtag = self._mpi_init_helper(num_wires)
+
+            # set the number of global and local wires
+            commSize = self.mpi_manager.getSize()
+            self.num_global_wires = commSize.bit_length() - 1
+            self.num_local_wires = num_wires - self.num_global_wires
+
+            self._check_memory_size(c_dtype, mpi_buf_size)
+
+        if not self.use_mpi:
+            self.num_local_wires = num_wires
+            self.num_global_wires = num_wires
+
+    def _mebibytesToBytes(self, mebibytes):
+        return mebibytes * 1024 * 1024  # 1 MiB = 1024 * 1024 bytes
+
+    def _check_memory_size(self, c_dtype, mpi_buf_size):
+        """Raise ValueError if the MPI buffer (given in MiB) exceeds the local state-vector size in bytes."""
+        sv_memsize = np.dtype(c_dtype).itemsize * (1 << self.num_local_wires)
+        if self._mebibytesToBytes(mpi_buf_size) > sv_memsize:
+            raise ValueError("The MPI buffer size is larger than the local state vector size.")
+
+    def _mpi_init_helper(self, num_wires):
+        """Set up MPI checks and initializations."""
+
+        # initialize MPIManager and config check in the MPIManager ctor
+        mpi_manager = MPIManager()
+
+        # check if number of GPUs per node is larger than number of processes per node
+        numDevices = self._dp.getTotalDevices()
+        numProcsNode = mpi_manager.getSizeNode()
+
+        if numDevices < numProcsNode:
+            raise ValueError(
+                "Number of devices should be larger than or equal to the number of processes on each node."
+            )
+
+        # check that the number of processes does not exceed 2**(num_wires - 1), half the statevector elements
+        if mpi_manager.getSize() > (1 << (num_wires - 1)):
+            raise ValueError(
+                "Number of processes should be smaller than the number of statevector elements."
+            )
+
+        # set GPU device
+        rank = mpi_manager.getRank()
+        deviceid = rank % numProcsNode
+        self._dp.setDeviceID(deviceid)
+        devtag = DevTag(deviceid)
+
+        return (mpi_manager, devtag)
diff --git a/pennylane_lightning/lightning_gpu/_state_vector.py b/pennylane_lightning/lightning_gpu/_state_vector.py
index d52875e337..530292614e 100644
--- a/pennylane_lightning/lightning_gpu/_state_vector.py
+++ b/pennylane_lightning/lightning_gpu/_state_vector.py
@@ -14,15 +14,48 @@
 """
 Class implementation for lightning_gpu state-vector manipulation.
 """
+from warnings import warn
+
+try:
+    from pennylane_lightning.lightning_gpu_ops import StateVectorC64, StateVectorC128
+
+    try:  # Try to import the MPI modules
+        # pylint: disable=no-name-in-module
+        from pennylane_lightning.lightning_gpu_ops import StateVectorMPIC64, StateVectorMPIC128
+
+        MPI_SUPPORT = True
+    except ImportError as ex:
+        warn(str(ex), UserWarning)
+
+        MPI_SUPPORT = False
+
+except ImportError as ex:
+    warn(str(ex), UserWarning)
+
+from itertools import product
 
 import numpy as np
 import pennylane as qml
 from pennylane import DeviceError
+from pennylane.measurements import MidMeasureMP
+from pennylane.ops import Conditional
+from pennylane.ops.op_math import Adjoint
 from pennylane.wires import Wires
 
+from pennylane_lightning.core._serialize import global_phase_diagonal
 from pennylane_lightning.core._state_vector_base import LightningBaseStateVector
 
-from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import MPIHandler
+
+gate_cache_needs_hash = (
+    qml.BlockEncode,
+    qml.ControlledQubitUnitary,
+    qml.DiagonalQubitUnitary,
+    qml.MultiControlledX,
+    qml.OrbitalRotation,
+    qml.PSWAP,
+    qml.QubitUnitary,
+)
 
 
 class LightningGPUStateVector(LightningBaseStateVector):
@@ -35,13 +68,283 @@ class LightningGPUStateVector(LightningBaseStateVector):
         dtype: Datatypes for state-vector representation. Must be one of
             ``np.complex64`` or ``np.complex128``. Default is ``np.complex128``
         device_name(string): state vector device name. Options: ["lightning.gpu"]
+        mpi_handler(MPIHandler): MPI handler for PennyLane Lightning GPU device.
+            Provides functionality to run on multiple devices.
+        sync (bool): immediately sync with host-sv after applying operation.
     """
 
-    def __init__(self, num_wires, dtype=np.complex128, device_name="lightning.gpu"):
+    def __init__(
+        self,
+        num_wires,
+        dtype=np.complex128,
+        device_name="lightning.gpu",
+        mpi_handler=None,
+        sync=True,
+    ):
 
-        if device_name != "lightning.gpu":
-            raise DeviceError(f'The device name "{device_name}" is not a valid option.')
-
-        super().__init__(num_wires, dtype)
+        super().__init__(num_wires, dtype, sync=sync)
 
         self._device_name = device_name
+
+        if mpi_handler is None:
+            mpi_handler = MPIHandler(False, 0, None, num_wires, dtype)
+
+        self._num_global_wires = mpi_handler.num_global_wires
+        self._num_local_wires = mpi_handler.num_local_wires
+
+        self._mpi_handler = mpi_handler
+        self._sync = sync
+
+        # Initialize the state vector
+        if self._mpi_handler.use_mpi:  # using MPI
+            self._qubit_state = self._state_dtype()(
+                self._mpi_handler.mpi_manager,
+                self._mpi_handler.devtag,
+                self._mpi_handler.mpi_buf_size,
+                self._mpi_handler.num_global_wires,
+                self._mpi_handler.num_local_wires,
+            )
+        else:  # without MPI
+            self._qubit_state = self._state_dtype()(self.num_wires)
+
+        use_async = False
+        self._qubit_state.setBasisStateZero(use_async)
+
+    def _state_dtype(self):
+        """Select the C++ state vector binding class for the MPI mode and ``dtype`` in use.
+
+        Returns: the MPI or non-MPI state vector class, in its C128 or C64 variant
+        """
+        if self._mpi_handler.use_mpi:
+            return StateVectorMPIC128 if self.dtype == np.complex128 else StateVectorMPIC64
+        else:
+            return StateVectorC128 if self.dtype == np.complex128 else StateVectorC64
+
+    def syncD2H(self, state_vector, use_async=False):
+        """Copy the state vector data on device to a state vector on the host provided by the user.
+        Args:
+            state_vector(array[complex]): the state vector array on host.
+            use_async(bool): indicates whether to use asynchronous memory copy from device to host or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> state_vector = np.zeros(2**dev.num_wires).astype(dev.C_DTYPE)
+        >>> dev.syncD2H(state_vector)
+        >>> print(state_vector)
+        [0.+0.j 1.+0.j]
+        """
+        self._qubit_state.DeviceToHost(state_vector.ravel(order="C"), use_async)
+
+    @property
+    def state(self):
+        """Copy the state vector data from the device to the host.
+
+        A NumPy array with ``2**num_local_wires`` entries is allocated on the host and returned.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=1)
+        >>> dev.apply([qml.PauliX(wires=[0])])
+        >>> print(dev.state)
+        [0.+0.j 1.+0.j]
+        """
+        state = np.zeros(1 << self._num_local_wires, dtype=self.dtype)
+        self.syncD2H(state)
+        return state
+
+    def syncH2D(self, state_vector, use_async=False):
+        """Copy the state vector data on host provided by the user to the state vector on the device.
+        Args:
+            state_vector(array[complex]): the state vector array on host.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+
+        **Example**
+
+        >>> dev = qml.device('lightning.gpu', wires=3)
+        >>> obs = qml.Identity(0) @ qml.PauliX(1) @ qml.PauliY(2)
+        >>> obs1 = qml.Identity(1)
+        >>> H = qml.Hamiltonian([1.0, 1.0], [obs1, obs])
+        >>> state_vector = np.array([0.0 + 0.0j, 0.0 + 0.1j, 0.1 + 0.1j, 0.1 + 0.2j,
+        ...     0.2 + 0.2j, 0.3 + 0.3j, 0.3 + 0.4j, 0.4 + 0.5j,], dtype=np.complex64,)
+        >>> dev.syncH2D(state_vector)
+        >>> res = dev.expval(H)
+        >>> print(res)
+        1.0
+        """
+        self._qubit_state.HostToDevice(state_vector.ravel(order="C"), use_async)
+
+    @staticmethod
+    def _asarray(arr, dtype=None):
+        """Return ``arr`` as an ndarray, cast to ``dtype`` when it holds float/complex data."""
+        arr = np.asarray(arr)  # arr is not copied
+
+        # Non-floating data is returned untouched, as is everything when no dtype is given.
+        if dtype is None or arr.dtype.kind not in ("f", "c"):
+            return arr
+
+        # ``dtype`` used to be resolved and then dropped; ``copy=False`` skips matching arrays.
+        return arr.astype(dtype, copy=False)
+
+    def _create_basis_state(self, index, use_async=False):
+        """Creates a computational basis state consisting of 0s and 1s, over all wires on device.
+        Args:
+            index (int): integer representing the computational basis state.
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy.
+        """
+        self._qubit_state.setBasisState(index, use_async)
+
+    def _apply_state_vector(self, state, device_wires, use_async=False):
+        """Initialize the state vector on GPU with a specified state on host.
+        Note that any use of this method will introduce host-overheads.
+        Args:
+            state (array[complex]): normalized input state (on host) of length ``2**len(wires)``
+                or broadcasted state of shape ``(batch_size, 2**len(wires))``
+            device_wires (Wires): wires that get initialized in the state
+            use_async(bool): indicates whether to use asynchronous memory copy from host to device or not.
+            Note: This function only supports synchronized memory copy from host to device.
+        """
+
+        if isinstance(state, self._qubit_state.__class__):
+            raise DeviceError("LightningGPU does not support allocate external state_vector.")
+
+            # TODO
+            # Create an implementation in the C++ backend and binding to be able
+            # to allocate memory for a new statevector and copy the data
+            # from an external state vector.
+            # state_data = allocate_aligned_array(state.size, np.dtype(self.dtype), True)
+            # state.getState(state_data)
+            # state = state_data
+
+        state = self._asarray(state, dtype=self.dtype)  # this operation on host
+        output_shape = [2] * self._num_local_wires
+
+        if len(device_wires) == self.num_wires and Wires(sorted(device_wires)) == device_wires:
+            # Initialize the entire device state with the input state
+            if self.num_wires == self._num_local_wires:
+                self.syncH2D(np.reshape(state, output_shape))
+                return
+            local_state = np.zeros(1 << self._num_local_wires, dtype=self.C_DTYPE)
+            self._mpi_handler.mpi_manager.Scatter(state, local_state, 0)
+            self.syncH2D(np.reshape(local_state, output_shape))
+            return
+
+        # generate basis states on subset of qubits via the cartesian product
+        basis_states = np.array(list(product([0, 1], repeat=len(device_wires))))
+
+        # get basis states to alter on full set of qubits
+        unravelled_indices = np.zeros((1 << len(device_wires), self.num_wires), dtype=int)
+        unravelled_indices[:, device_wires] = basis_states
+
+        # get indices for which the state is changed to input state vector elements
+        ravelled_indices = np.ravel_multi_index(unravelled_indices.T, [2] * self.num_wires)
+
+        # set the state vector on GPU with the ravelled_indices and their corresponding values
+        self._qubit_state.setStateVector(
+            ravelled_indices, state, use_async
+        )  # this operation on device
+
+    def _apply_lightning_controlled(self, operation):
+        """Apply a controlled ``GlobalPhase`` operation to the state vector.
+
+        Args:
+            operation (~pennylane.operation.Operation): controlled operation to apply
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        control_wires = list(operation.control_wires)
+        control_values = operation.control_values
+        name = operation.name
+        # Apply GlobalPhase
+        inv = False
+        param = operation.parameters[0]
+        wires = self.wires.indices(operation.wires)
+        matrix = global_phase_diagonal(param, self.wires, control_wires, control_values)
+        state.apply(name, wires, inv, [[param]], matrix)
+
+    def _apply_lightning_midmeasure(
+        self, operation: MidMeasureMP, mid_measurements: dict, postselect_mode: str
+    ):
+        """Reject a MidMeasureMP operation; this method always raises and applies nothing.
+
+        Args:
+            operation (~pennylane.operation.Operation): mid-circuit measurement
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots.
+
+        Raises:
+            DeviceError: always, because LightningGPU does not support mid-circuit measurements.
+        """
+        raise DeviceError("LightningGPU does not support Mid-circuit measurements.")
+
+    def _apply_lightning(
+        self, operations, mid_measurements: dict = None, postselect_mode: str = None
+    ):
+        """Apply a list of operations to the state vector.
+
+        Args:
+            operations (list[~pennylane.operation.Operation]): operations to apply
+            mid_measurements (None, dict): Dictionary of mid-circuit measurements
+            postselect_mode (str): Configuration for handling shots with mid-circuit measurement
+                postselection. Use ``"hw-like"`` to discard invalid shots and ``"fill-shots"`` to
+                keep the same number of shots. Default is ``None``.
+
+        Returns:
+            None
+        """
+        state = self.state_vector
+
+        # Skip over identity operations instead of performing
+        # matrix multiplication with it.
+        for operation in operations:
+            if isinstance(operation, qml.Identity):
+                continue
+            if isinstance(operation, Adjoint):
+                name = operation.base.name
+                invert_param = True
+            else:
+                name = operation.name
+                invert_param = False
+            method = getattr(state, name, None)
+            wires = list(operation.wires)
+
+            if method is not None:  # apply specialized gate
+                param = operation.parameters
+                method(wires, invert_param, param)
+            elif isinstance(operation, qml.ops.Controlled) and isinstance(
+                operation.base, qml.GlobalPhase
+            ):  # apply n-controlled gate
+                # LGPU does not support controlled gates except for controlled GlobalPhase
+                self._apply_lightning_controlled(operation)
+            else:  # apply gate as a matrix
+                try:
+                    mat = qml.matrix(operation)
+                except AttributeError:  # pragma: no cover
+                    # To support older versions of PL
+                    mat = operation.matrix
+
+                r_dtype = np.float32 if self.dtype == np.complex64 else np.float64
+                param = (
+                    [[r_dtype(operation.hash)]]
+                    if isinstance(operation, gate_cache_needs_hash)
+                    else []
+                )
+                if len(mat) == 0:
+                    raise ValueError("Unsupported operation")
+
+                self._qubit_state.apply(
+                    name,
+                    wires,
+                    False,
+                    param,
+                    mat.ravel(order="C"),  # inv = False: Matrix already in correct form;
+                )  # Parameters can be ignored for explicit matrices; TODO confirm C- vs F-order
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index 4d4808f668..f43083af70 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -19,15 +19,15 @@
 
 from ctypes.util import find_library
 from importlib import util as imp_util
-from numbers import Number
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple, Union
 from warnings import warn
 
 import numpy as np
 import pennylane as qml
 from pennylane.devices import DefaultExecutionConfig, ExecutionConfig
 from pennylane.devices.modifiers import simulator_tracking, single_tape_support
+from pennylane.measurements import MidMeasureMP
 from pennylane.operation import Operator
 from pennylane.tape import QuantumScript, QuantumTape
 from pennylane.transforms.core import TransformProgram
@@ -41,31 +41,30 @@
 
 from ._adjoint_jacobian import LightningGPUAdjointJacobian
 from ._measurements import LightningGPUMeasurements
+from ._mpi_handler import MPIHandler
 from ._state_vector import LightningGPUStateVector
 
 try:
+    from pennylane_lightning.lightning_gpu_ops import (
+        DevPool,
+        backend_info,
+        get_gpu_arch,
+        is_gpu_supported,
+    )
 
-    from pennylane_lightning.lightning_gpu_ops import backend_info
+    LGPU_CPP_BINARY_AVAILABLE = True
 
     try:
         # pylint: disable=no-name-in-module
-        from pennylane_lightning.lightning_gpu_ops import MPIManager
+        from pennylane_lightning.lightning_gpu_ops import DevTag, MPIManager
+
+        from ._mpi_handler import LightningGPU_MPIHandler
 
         MPI_SUPPORT = True
     except ImportError as ex:
         warn(str(ex), UserWarning)
         MPI_SUPPORT = False
 
-    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
-        raise ImportError(
-            "custatevec libraries not found. Please pip install the appropriate custatevec library in a virtual environment."
-        )
-    # if not DevPool.getTotalDevices():  # pragma: no cover
-    #     raise ValueError("No supported CUDA-capable device found")
-
-    # if not is_gpu_supported():  # pragma: no cover
-    #     raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
-
     LGPU_CPP_BINARY_AVAILABLE = True
 except (ImportError, ValueError) as ex:
     warn(str(ex), UserWarning)
@@ -73,10 +72,6 @@
     LGPU_CPP_BINARY_AVAILABLE = False
 
 
-def _mebibytesToBytes(mebibytes):
-    return mebibytes * 1024 * 1024
-
-
 _operations = frozenset(
     {
         "Identity",
@@ -159,16 +154,6 @@ def _mebibytesToBytes(mebibytes):
     }
 )
 
-gate_cache_needs_hash = (
-    qml.BlockEncode,
-    qml.ControlledQubitUnitary,
-    qml.DiagonalQubitUnitary,
-    qml.MultiControlledX,
-    qml.OrbitalRotation,
-    qml.PSWAP,
-    qml.QubitUnitary,
-)
-
 
 def stopping_condition(op: Operator) -> bool:
     """A function that determines whether or not an operation is supported by ``lightning.gpu``."""
@@ -224,6 +209,21 @@ def _add_adjoint_transforms(program: TransformProgram) -> None:
     return 0
 
 
+def check_gpu_resources() -> None:
+    """Raise unless cuStateVec is installed and a supported CUDA device is present."""
+    if find_library("custatevec") is None and not imp_util.find_spec("cuquantum"):
+
+        raise ImportError(
+            "cuStateVec libraries not found. Please pip install the appropriate cuStateVec library in a virtual environment."
+        )
+
+    if not DevPool.getTotalDevices():
+        raise ValueError("No supported CUDA-capable device found")
+
+    if not is_gpu_supported():
+        raise ValueError(f"CUDA device is an unsupported version: {get_gpu_arch()}")
+
+
 @simulator_tracking
 @single_tape_support
 class LightningGPU(LightningBase):
@@ -245,6 +245,9 @@ class LightningGPU(LightningBase):
         batch_obs (bool): Determine whether we process observables in parallel when
             computing the jacobian. This value is only relevant when the lightning.gpu
             is built with MPI. Default is False.
+        mpi (bool): declare if the device will use the MPI support.
+        mpi_buf_size (int): size of GPU memory (in MiB) set for MPI operation and its default value is 64 MiB.
+        sync (bool): immediately sync with host-sv after applying operation.
     """
 
     # General device options
@@ -271,7 +274,10 @@ def __init__(  # pylint: disable=too-many-arguments
         c_dtype=np.complex128,
         shots=None,
         batch_obs=False,
-        # GPU arguments
+        # GPU and MPI arguments
+        mpi: bool = False,
+        mpi_buf_size: int = 0,
+        sync: bool = False,
     ):
         if not self._CPP_BINARY_AVAILABLE:
             raise ImportError(
@@ -280,6 +286,8 @@ def __init__(  # pylint: disable=too-many-arguments
                 "https://docs.pennylane.ai/projects/lightning/en/stable/dev/installation.html."
             )
 
+        check_gpu_resources()
+
         super().__init__(
             wires=wires,
             c_dtype=c_dtype,
@@ -288,19 +296,28 @@ def __init__(  # pylint: disable=too-many-arguments
         )
 
         # Set the attributes to call the LightningGPU classes
+        self._set_lightning_classes()
 
         # GPU specific options
+        self._dp = DevPool()
+        self._sync = sync
 
         # Creating the state vector
+        self._mpi_handler = MPIHandler(mpi, mpi_buf_size, self._dp, len(self.wires), c_dtype)
+
+        self._statevector = self.LightningStateVector(
+            num_wires=len(self.wires), dtype=c_dtype, mpi_handler=self._mpi_handler, sync=self._sync
+        )
 
     @property
     def name(self):
         """The name of the device."""
         return "lightning.gpu"
 
-    def _set_Lightning_classes(self):
+    def _set_lightning_classes(self):
         """Load the LightningStateVector, LightningMeasurements, LightningAdjointJacobian as class attribute"""
-        return 0
+        self.LightningStateVector = LightningGPUStateVector
+        self.LightningMeasurements = LightningGPUMeasurements
 
     def _setup_execution_config(self, config):
         """
@@ -384,4 +401,9 @@ def simulate(
 
         Note that this function can return measurements for non-commuting observables simultaneously.
         """
-        return 0
+        if circuit.shots and (any(isinstance(op, MidMeasureMP) for op in circuit.operations)):
+            raise qml.DeviceError("LightningGPU does not support Mid-circuit measurements.")
+
+        state.reset_state(sync=False)
+        final_state = state.get_final_state(circuit)
+        return LightningGPUMeasurements(final_state).measure_final_state(circuit)
diff --git a/pennylane_lightning/lightning_kokkos/_measurements.py b/pennylane_lightning/lightning_kokkos/_measurements.py
index 6e706614ba..46260f7edb 100644
--- a/pennylane_lightning/lightning_kokkos/_measurements.py
+++ b/pennylane_lightning/lightning_kokkos/_measurements.py
@@ -21,11 +21,11 @@
 except ImportError:
     pass
 
-from typing import List
+from typing import Any, List
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
diff --git a/pennylane_lightning/lightning_qubit/_measurements.py b/pennylane_lightning/lightning_qubit/_measurements.py
index f762fcb7e6..71047e6d19 100644
--- a/pennylane_lightning/lightning_qubit/_measurements.py
+++ b/pennylane_lightning/lightning_qubit/_measurements.py
@@ -22,11 +22,11 @@
     pass
 
 from functools import reduce
-from typing import List
+from typing import Any, List
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import CountsMP, SampleMeasurement, Shots
+from pennylane.measurements import CountsMP, MeasurementProcess, SampleMeasurement, Shots
 from pennylane.typing import TensorLike
 
 from pennylane_lightning.core._measurements_base import LightningBaseMeasurements
diff --git a/tests/conftest.py b/tests/conftest.py
index b5ddf416ce..ace8debfd9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -142,10 +142,14 @@ def get_device():
         from pennylane_lightning.lightning_kokkos_ops import LightningException
 elif device_name == "lightning.gpu":
     from pennylane_lightning.lightning_gpu import LightningGPU as LightningDevice
+    from pennylane_lightning.lightning_gpu._measurements import (
+        LightningGPUMeasurements as LightningMeasurements,
+    )
+    from pennylane_lightning.lightning_gpu._state_vector import (
+        LightningGPUStateVector as LightningStateVector,
+    )
 
     LightningAdjointJacobian = None
-    LightningMeasurements = None
-    LightningStateVector = None
 
     if hasattr(pennylane_lightning, "lightning_gpu_ops"):
         import pennylane_lightning.lightning_gpu_ops as lightning_ops
diff --git a/tests/lightning_qubit/test_measurements_class.py b/tests/lightning_qubit/test_measurements_class.py
index fff60006e6..f89786b8c5 100644
--- a/tests/lightning_qubit/test_measurements_class.py
+++ b/tests/lightning_qubit/test_measurements_class.py
@@ -38,9 +38,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
@@ -613,6 +610,10 @@ def test_double_return_value(self, shots, measurement, obs0_, obs1_, lightning_s
             # allclose -> absolute(r - e) <= (atol + rtol * absolute(e))
             assert np.allclose(r, e, atol=dtol, rtol=dtol)
 
+    @pytest.mark.skipif(
+        device_name == "lightning.gpu",
+        reason="lightning.gpu does not support out of order prob.",
+    )
     @pytest.mark.parametrize(
         "cases",
         [
diff --git a/tests/lightning_qubit/test_simulate_method.py b/tests/lightning_qubit/test_simulate_method.py
index 9dfecb64d3..ff5536846d 100644
--- a/tests/lightning_qubit/test_simulate_method.py
+++ b/tests/lightning_qubit/test_simulate_method.py
@@ -28,9 +28,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
-
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
diff --git a/tests/lightning_qubit/test_state_vector_class.py b/tests/lightning_qubit/test_state_vector_class.py
index 340b147b9b..05f99a9f1f 100644
--- a/tests/lightning_qubit/test_state_vector_class.py
+++ b/tests/lightning_qubit/test_state_vector_class.py
@@ -30,6 +30,9 @@
     except ImportError:
         pass
 
+if device_name == "lightning.gpu":
+    from pennylane_lightning.lightning_gpu._mpi_handler import MPIHandler
+
 if device_name == "lightning.tensor":
     pytest.skip("Skipping tests for the LightningTensor class.", allow_module_level=True)
 
@@ -39,8 +42,6 @@
         allow_module_level=True,
     )
 
-if device_name == "lightning.gpu":
-    pytest.skip("LGPU new API in WIP.  Skipping.", allow_module_level=True)
 
 if not LightningDevice._CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)
@@ -89,10 +90,18 @@ def test_apply_state_vector_with_lightning_handle(tol):
     state_vector_1 = LightningStateVector(2)
     state_vector_1.apply_operations([qml.BasisState(np.array([0, 1]), wires=[0, 1])])
 
-    state_vector_2 = LightningStateVector(2)
-    state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+    if device_name == "lightning.gpu":
+        with pytest.raises(
+            qml.DeviceError, match="LightningGPU does not support allocate external state_vector."
+        ):
+            state_vector_2 = LightningStateVector(2)
+            state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
+
+    else:
+        state_vector_2 = LightningStateVector(2)
+        state_vector_2._apply_state_vector(state_vector_1.state_vector, Wires([0, 1]))
 
-    assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
+        assert np.allclose(state_vector_1.state, state_vector_2.state, atol=tol, rtol=0)
 
 
 @pytest.mark.parametrize(
@@ -146,7 +155,10 @@ def test_reset_state(tol, operation, par):
     state_vector = LightningStateVector(wires)
     state_vector.apply_operations([operation(np.array(par), Wires(range(wires)))])
 
-    state_vector.reset_state()
+    if device_name == "lightning.gpu":
+        state_vector.reset_state(sync=False)
+    else:
+        state_vector.reset_state()
 
     expected_output = np.array([1, 0, 0, 0], dtype=state_vector.dtype)
     assert np.allclose(state_vector.state, expected_output, atol=tol, rtol=0)