diff --git a/doc/releases/changelog-dev.md b/doc/releases/changelog-dev.md
index b935938629c..30617b48e91 100644
--- a/doc/releases/changelog-dev.md
+++ b/doc/releases/changelog-dev.md
@@ -139,9 +139,10 @@
   [(#4628)](https://github.com/PennyLaneAI/pennylane/pull/4628)
   [(#4649)](https://github.com/PennyLaneAI/pennylane/pull/4649)
 
-* The `JacobianProductCalculator` abstract base class and implementation `TransformJacobianProducts`
-  have been added to `pennylane.interfaces.jacobian_products`.
+* The `JacobianProductCalculator` abstract base class and implementations `TransformJacobianProducts`
+  and `DeviceDerivatives` have been added to `pennylane.interfaces.jacobian_products`.
   [(#4435)](https://github.com/PennyLaneAI/pennylane/pull/4435)
+  [(#4527)](https://github.com/PennyLaneAI/pennylane/pull/4527)
 
 * Extended ``qml.qchem.import_state`` to import wavefunctions from MPS DMRG and SHCI classical
   calculations performed with the Block2 and Dice libraries, incorporating new tests and wavefunction
diff --git a/pennylane/_device.py b/pennylane/_device.py
index c24400c2ec2..7658b06b49f 100644
--- a/pennylane/_device.py
+++ b/pennylane/_device.py
@@ -560,6 +560,9 @@ def execute_and_gradients(self, circuits, method="jacobian", **kwargs):
             tuple[list[array[float]], list[array[float]]]: Tuple containing list of measured value(s)
             and list of Jacobians. Returned Jacobians should be of shape ``(output_shape, num_params)``.
         """
+        if self.tracker.active:
+            self.tracker.update(execute_and_derivative_batches=1, derivatives=len(circuits))
+            self.tracker.record()
         gradient_method = getattr(self, method)
 
         res = []
@@ -593,6 +596,9 @@ def gradients(self, circuits, method="jacobian", **kwargs):
             list[array[float]]: List of Jacobians. Returned Jacobians should be of
             shape ``(output_shape, num_params)``.
         """
+        if self.tracker.active:
+            self.tracker.update(derivatives=len(circuits))
+            self.tracker.record()
         gradient_method = getattr(self, method)
         return [gradient_method(circuit, **kwargs) for circuit in circuits]
 
diff --git a/pennylane/gradients/hadamard_gradient.py b/pennylane/gradients/hadamard_gradient.py
index dbc99e35bb6..edac693b90e 100644
--- a/pennylane/gradients/hadamard_gradient.py
+++ b/pennylane/gradients/hadamard_gradient.py
@@ -207,6 +207,10 @@ def hadamard_grad(
     assert_no_state_returns(tape.measurements, transform_name)
     assert_no_variance(tape.measurements, transform_name)
     assert_no_tape_batching(tape, transform_name)
+    if len(tape.measurements) > 1 and tape.shots.has_partitioned_shots:
+        raise NotImplementedError(
+            "hadamard gradient does not support multiple measurements with partitioned shots."
+        )
 
     if argnum is None and not tape.trainable_params:
         return _no_trainable_grad(tape)
@@ -323,12 +327,14 @@ def _expval_hadamard_grad(tape, argnum, aux_wire):
     multi_params = len(tape.trainable_params) > 1
 
     def processing_fn(results):  # pylint: disable=too-many-branches
-        final_res = [
-            [qml.math.convert_like(2 * coeff * r, r) for r in res]
-            if isinstance(res, tuple)
-            else qml.math.convert_like(2 * coeff * res, res)
-            for coeff, res in zip(coeffs, results)
-        ]
+        """Post processing function for computing a hadamard gradient."""
+        final_res = []
+        for coeff, res in zip(coeffs, results):
+            if isinstance(res, tuple):
+                new_val = [qml.math.convert_like(2 * coeff * r, r) for r in res]
+            else:
+                new_val = qml.math.convert_like(2 * coeff * res, res)
+            final_res.append(new_val)
 
         # Post process for probs
         if measurements_probs:
diff --git a/pennylane/interfaces/__init__.py b/pennylane/interfaces/__init__.py
index fc8dda6dc92..b0698fd8364 100644
--- a/pennylane/interfaces/__init__.py
+++ b/pennylane/interfaces/__init__.py
@@ -48,6 +48,7 @@
 
     ~interfaces.jacobian_products.JacobianProductCalculator
     ~interfaces.jacobian_products.TransformJacobianProducts
+    ~interfaces.jacobian_products.DeviceDerivatives
 
 """
 from .execution import cache_execute, execute, INTERFACE_MAP, SUPPORTED_INTERFACES
diff --git a/pennylane/interfaces/jacobian_products.py b/pennylane/interfaces/jacobian_products.py
index 022c113f560..8854dc21852 100644
--- a/pennylane/interfaces/jacobian_products.py
+++ b/pennylane/interfaces/jacobian_products.py
@@ -18,7 +18,9 @@
 from functools import partial
 import inspect
 import logging
-from typing import Tuple, Callable, Optional
+from typing import Tuple, Callable, Optional, Union
+
+from cachetools import LRUCache
 
 import pennylane as qml
 from pennylane.tape import QuantumScript
@@ -30,6 +32,36 @@
 logger.addHandler(logging.NullHandler())
 
 
+def _compute_vjps(jacs, dys, tapes):
+    """Compute the vjps of multiple tapes, directly for a Jacobian and co-tangents dys."""
+    f = {True: qml.gradients.compute_vjp_multi, False: qml.gradients.compute_vjp_single}
+
+    vjps = []
+    for jac, dy, t in zip(jacs, dys, tapes):
+        multi = len(t.measurements) > 1
+        if t.shots.has_partitioned_shots:
+            shot_vjps = [f[multi](d, j) for d, j in zip(dy, jac)]
+            vjps.append(qml.math.sum(qml.math.stack(shot_vjps), axis=0))
+        else:
+            vjps.append(f[multi](dy, jac))
+
+    return tuple(vjps)
+
+
+def _compute_jvps(jacs, tangents, tapes):
+    """Compute the jvps of multiple tapes, directly for a Jacobian and tangents."""
+    f = {True: qml.gradients.compute_jvp_multi, False: qml.gradients.compute_jvp_single}
+
+    jvps = []
+    for jac, dx, t in zip(jacs, tangents, tapes):
+        multi = len(t.measurements) > 1
+        if t.shots.has_partitioned_shots:
+            jvps.append(tuple(f[multi](dx, j) for j in jac))
+        else:
+            jvps.append(f[multi](dx, jac))
+    return tuple(jvps)
+
+
 class JacobianProductCalculator(abc.ABC):
     """Provides methods for calculating the JVP/VJP between the Jacobians of tapes and tangents/cotangents."""
 
@@ -215,3 +247,303 @@ def compute_jacobian(self, tapes: Batch):
         )
         results = self._inner_execute(jac_tapes)
         return tuple(batch_post_processing(results))
+
+
+class DeviceDerivatives(JacobianProductCalculator):
+    """Calculate jacobian products via a device provided jacobian.  This class relies on either ``qml.Device.gradients`` or
+    ``qml.devices.Device.compute_derivatives``.
+
+    Args:
+
+        device (Union[pennylane.Device, pennylane.devices.Device]): the device for execution and derivatives.
+            Must support first order gradients with the requested configuration.
+        execution_config (pennylane.devices.ExecutionConfig): a datastructure containing the options needed to fully
+           describe the execution. Only used with :class:`pennylane.devices.Device` from the new device interface.
+        gradient_kwargs (dict): a dictionary of keyword arguments for the gradients. Only used with a :class:`~.pennylane.Device`
+            from the old device interface.
+
+    **Examples:**
+
+    >>> device = qml.device('default.qubit')
+    >>> config = qml.devices.ExecutionConfig(gradient_method="adjoint")
+    >>> jpc = DeviceDerivatives(device, config, {})
+
+    This same class can also be used with the old device interface.
+
+    >>> device = qml.device('lightning.qubit', wires=5)
+    >>> gradient_kwargs = {"method": "adjoint_jacobian"}
+    >>> jpc_lightning = DeviceDerivatives(device, gradient_kwargs=gradient_kwargs)
+
+    **Technical comments on caching and calculating the gradients on execution:**
+
+    In order to store results and Jacobians for the backward pass during the forward pass,
+    the ``_jacs_cache`` and ``_results_cache`` attributes are ``LRUCache`` objects with a maximum size of 10.
+    In the current execution pipeline, only one batch will be used per instance, but a size of 10 adds some extra
+    flexibility for future uses.
+
+    Note that batches of identically looking :class:`~.QuantumScript` s that are different instances will be cached separately.
+    This is because the ``hash`` of  :class:`~.QuantumScript` is expensive, as it requires inspecting all its constituents,
+    which is not worth the effort in this case.
+
+    When a forward pass with :meth:`~.execute_and_cache_jacobian` is called, both the results and the jacobian for the object are stored.
+
+    >>> tape = qml.tape.QuantumScript([qml.RX(1.0, wires=0)], [qml.expval(qml.PauliZ(0))])
+    >>> batch = (tape, )
+    >>> with device.tracker:
+    ...     results = jpc.execute_and_cache_jacobian(batch )
+    >>> results
+    (0.5403023058681398,)
+    >>> device.tracker.totals
+    {'execute_and_derivative_batches': 1, 'executions': 1, 'derivatives': 1}
+    >>> jpc._jacs_cache
+    LRUCache({5660934048: (array(-0.84147098),)}, maxsize=10, currsize=1)
+
+    Then when the vjp, jvp, or jacobian is requested, that cached value is used instead of requesting from
+    the device again.
+
+    >>> with device.tracker:
+    ...     vjp = jpc.compute_vjp(batch , (0.5, ) )
+    >>> vjp
+    (array([-0.42073549]),)
+    >>> device.tracker.totals
+    {}
+
+    """
+
+    def __repr__(self):
+        return f"<DeviceDerivatives: {self._device.name}, {self._gradient_kwargs}, {self._execution_config}>"
+
+    def __init__(
+        self,
+        device: Union[qml.devices.Device, qml.Device],
+        execution_config: Optional["qml.devices.ExecutionConfig"] = None,
+        gradient_kwargs: dict = None,
+    ):
+        if gradient_kwargs is None:
+            gradient_kwargs = {}
+        if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+            logger.debug(
+                "DeviceDerivatives created with (%s, %s, %s)",
+                device,
+                execution_config,
+                gradient_kwargs,
+            )
+
+        self._device = device
+        self._execution_config = execution_config
+        self._gradient_kwargs = gradient_kwargs
+
+        self._uses_new_device = not isinstance(device, qml.Device)
+
+        # only really need to keep most recent entry, but keeping 10 around just in case
+        self._results_cache = LRUCache(maxsize=10)
+        self._jacs_cache = LRUCache(maxsize=10)
+
+    def _dev_execute_and_compute_derivatives(self, tapes: Batch):
+        """
+        Converts tapes to numpy before computing the results and derivatives on the device.
+
+        Dispatches between the two different device interfaces.
+        """
+        numpy_tapes = tuple(qml.transforms.convert_to_numpy_parameters(t) for t in tapes)
+        if self._uses_new_device:
+            return self._device.execute_and_compute_derivatives(numpy_tapes, self._execution_config)
+        return self._device.execute_and_gradients(numpy_tapes, **self._gradient_kwargs)
+
+    def _dev_execute(self, tapes: Batch):
+        """
+        Converts tapes to numpy before computing just the results on the device.
+
+        Dispatches between the two different device interfaces.
+        """
+        numpy_tapes = tuple(qml.transforms.convert_to_numpy_parameters(t) for t in tapes)
+        if self._uses_new_device:
+            return self._device.execute(numpy_tapes, self._execution_config)
+        return self._device.batch_execute(numpy_tapes)
+
+    def _dev_compute_derivatives(self, tapes: Batch):
+        """
+        Converts tapes to numpy before computing the derivatives on the device.
+
+        Dispatches between the two different device interfaces.
+        """
+        numpy_tapes = tuple(qml.transforms.convert_to_numpy_parameters(t) for t in tapes)
+        if self._uses_new_device:
+            return self._device.compute_derivatives(numpy_tapes, self._execution_config)
+        return self._device.gradients(numpy_tapes, **self._gradient_kwargs)
+
+    def execute_and_cache_jacobian(self, tapes: Batch):
+        """Forward pass used to cache the results and jacobians.
+
+        Args:
+            tapes (tuple[`~.QuantumScript`]): the batch of tapes to execute and take derivatives of
+
+        Returns:
+            ResultBatch: the results of the execution.
+
+        Side Effects:
+            Caches both the results and jacobian into ``_results_cache`` and ``_jacs_cache``.
+
+        """
+        if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+            logger.debug("Forward pass called with %s", tapes)
+        results, jac = self._dev_execute_and_compute_derivatives(tapes)
+        self._results_cache[tapes] = results
+        self._jacs_cache[tapes] = jac
+        return results
+
+    def execute_and_compute_jvp(self, tapes: Batch, tangents):
+        """Calculate both the results for a batch of tapes and the jvp.
+
+        This method is required to compute JVPs in the JAX interface.
+
+        Args:
+            tapes (tuple[`~.QuantumScript`]): The batch of tapes to take the derivatives of
+            tangents (Sequence[Sequence[TensorLike]]): the tangents for the parameters of the tape.
+                The ``i`` th tangent corresponds to the ``i`` th tape, and the ``j`` th entry into a
+                tangent entry corresponds to the ``j`` th trainable parameter of the tape.
+
+        Returns:
+            ResultBatch, TensorLike: the results of the execution and the jacobian vector product
+
+        Side Effects:
+            caches newly computed results or jacobians if they were not already cached.
+
+        **Examples:**
+
+        For an instance of :class:`~.DeviceDerivatives` ``jpc``, we have:
+
+        >>> tape0 = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+        >>> tape1 = qml.tape.QuantumScript([qml.RY(0.2, wires=0)], [qml.expval(qml.PauliZ(0))])
+        >>> batch = (tape0, tape1)
+        >>> tangents0 = (1.5, )
+        >>> tangents1 = (2.0, )
+        >>> tangents = (tangents0, tangents1)
+        >>> results, jvps = jpc.execute_and_compute_jvp(batch, tangents)
+        >>> expected_results = (np.cos(0.1), np.cos(0.2))
+        >>> qml.math.allclose(results, expected_results)
+        True
+        >>> jvps
+        (array(-0.14975012), array(-0.39733866))
+        >>> expected_jvps = 1.5 * -np.sin(0.1), 2.0 * -np.sin(0.2)
+        >>> qml.math.allclose(jvps, expected_jvps)
+        True
+
+        While this method could support non-scalar parameters in theory, no implementation currently supports
+        jacobians with non-scalar parameters.
+
+        """
+        if tapes not in self._results_cache and tapes not in self._jacs_cache:
+            results, jacs = self._dev_execute_and_compute_derivatives(tapes)
+            self._results_cache[tapes] = results
+            self._jacs_cache[tapes] = jacs
+        else:
+            if tapes in self._results_cache:
+                if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+                    logger.debug("%s : Retrieving results from cache.", self)
+                results = self._results_cache[tapes]
+            else:
+                results = self._dev_execute(tapes)
+                self._results_cache[tapes] = results
+
+            if tapes in self._jacs_cache:
+                if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+                    logger.debug("%s : Retrieving jacobian from cache.", self)
+                jacs = self._jacs_cache[tapes]
+            else:
+                # Here the jac was not cached but the results were. This can not happen because results are never
+                # cached alone (note that in the else clause above computing only results, jac must already be present)
+                raise NotImplementedError(
+                    "No path to cache results without caching jac. This branch should not occur."
+                )
+
+        jvps = _compute_jvps(jacs, tangents, tapes)
+        return results, jvps
+
+    def compute_vjp(self, tapes, dy):
+        """Compute the vjp for a given batch of tapes.
+
+        This method is used by autograd, torch, and tensorflow to compute VJPs.
+
+        Args:
+            tapes (tuple[`~.QuantumScript`]): the batch of tapes to take the derivatives of
+            dy (tuple[tuple[TensorLike]]): the derivatives of the results of an execution.
+                The ``i`` th entry (cotangent) corresponds to the ``i`` th tape, and the ``j`` th entry of the ``i`` th
+                cotangent corresponds to the ``j`` th return value of the ``i`` th tape.
+
+        Returns:
+            TensorLike: the vector jacobian product.
+
+        Side Effects:
+            caches the newly computed jacobian if it wasn't already present in the cache.
+
+        **Examples:**
+
+        For an instance of :class:`~.DeviceDerivatives` ``jpc``, we have:
+
+        >>> tape0 = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+        >>> tape1 = qml.tape.QuantumScript([qml.RY(0.2, wires=0)], [qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliX(0))])
+        >>> batch = (tape0, tape1)
+        >>> dy0 = (0.5, )
+        >>> dy1 = (2.0, 3.0)
+        >>> dys = (dy0, dy1)
+        >>> vjps = jpc.compute_vjp(batch, dys)
+        >>> vjps
+        (array([-0.04991671]), array([2.54286107]))
+        >>> expected_vjp0 = 0.5 * -np.sin(0.1)
+        >>> qml.math.allclose(vjps[0], expected_vjp0)
+        True
+        >>> expected_vjp1 = 2.0 * -np.sin(0.2) + 3.0 * np.cos(0.2)
+        >>> qml.math.allclose(vjps[1], expected_vjp1)
+        True
+
+        While this method could support non-scalar parameters in theory, no implementation currently supports
+        jacobians with non-scalar parameters.
+
+        """
+        if tapes in self._jacs_cache:
+            if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+                logger.debug(" %s : Retrieving jacobian from cache.", self)
+            jacs = self._jacs_cache[tapes]
+        else:
+            jacs = self._dev_compute_derivatives(tapes)
+            self._jacs_cache[tapes] = jacs
+
+        return _compute_vjps(jacs, dy, tapes)
+
+    def compute_jacobian(self, tapes):
+        """Compute the full Jacobian for a batch of tapes.
+
+        This method is required to compute Jacobians in the ``jax-jit`` interface.
+
+        Args:
+            tapes: the batch of tapes to take the Jacobian of
+
+        Returns:
+            TensorLike: the full jacobian
+
+        Side Effects:
+            caches the newly computed jacobian if it wasn't already present in the cache.
+
+        **Examples:**
+
+        For an instance of :class:`~.DeviceDerivatives` ``jpc``, we have:
+
+        >>> tape0 = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+        >>> tape1 = qml.tape.QuantumScript([qml.RY(0.2, wires=0)], [qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliX(0))])
+        >>> batch = (tape0, tape1)
+        >>> jpc.compute_jacobian(batch)
+        (array(-0.09983342), (array(-0.19866933), array(0.98006658)))
+
+        While this method could support non-scalar parameters in theory, no implementation currently supports
+        jacobians with non-scalar parameters.
+
+        """
+        if tapes in self._jacs_cache:
+            if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
+                logger.debug("%s : Retrieving jacobian from cache.", self)
+            return self._jacs_cache[tapes]
+
+        jacs = self._dev_compute_derivatives(tapes)
+        self._jacs_cache[tapes] = jacs
+        return jacs
diff --git a/tests/gradients/core/test_hadamard_gradient.py b/tests/gradients/core/test_hadamard_gradient.py
index 1c0ba0e2db6..339385c304e 100644
--- a/tests/gradients/core/test_hadamard_gradient.py
+++ b/tests/gradients/core/test_hadamard_gradient.py
@@ -74,6 +74,16 @@ def test_batched_tape_raises(self):
         with pytest.raises(NotImplementedError, match=_match):
             qml.gradients.hadamard_grad(tape)
 
+    def test_tape_with_partitioned_shots_multiple_measurements_raises(self):
+        """Test that an error is raised with multiple measurements and partitioned shots."""
+        tape = qml.tape.QuantumScript(
+            [qml.RX(0.1, wires=0)],
+            [qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliY(0))],
+            shots=(1000, 10000),
+        )
+        with pytest.raises(NotImplementedError):
+            qml.gradients.hadamard_grad(tape)
+
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.U1])
     def test_pauli_rotation_gradient(self, G, theta, tol):
diff --git a/tests/interfaces/test_jacobian_products.py b/tests/interfaces/test_jacobian_products.py
index b8a45d4afb1..aae81d45758 100644
--- a/tests/interfaces/test_jacobian_products.py
+++ b/tests/interfaces/test_jacobian_products.py
@@ -16,6 +16,8 @@
 """
 # pylint: disable=protected-access
 import pytest
+from cachetools import LRUCache
+from param_shift_dev import ParamShiftDerivativesDevice
 
 import numpy as np
 
@@ -23,9 +25,14 @@
 from pennylane.interfaces.jacobian_products import (
     JacobianProductCalculator,
     TransformJacobianProducts,
+    DeviceDerivatives,
 )
 
-dev = qml.devices.DefaultQubit()
+dev = qml.device("default.qubit")
+dev_old = qml.device("default.qubit.legacy", wires=5)
+adjoint_config = qml.devices.ExecutionConfig(gradient_method="adjoint")
+dev_ps = ParamShiftDerivativesDevice()
+ps_config = qml.devices.ExecutionConfig(gradient_method="parameter-shift")
 
 
 def inner_execute_numpy(tapes):
@@ -36,8 +43,25 @@ def inner_execute_numpy(tapes):
 hadamard_grad_jpc = TransformJacobianProducts(
     inner_execute_numpy, qml.gradients.hadamard_grad, {"aux_wire": "aux"}
 )
+device_jacs = DeviceDerivatives(dev, adjoint_config)
+legacy_device_jacs = DeviceDerivatives(dev_old, gradient_kwargs={"method": "adjoint_jacobian"})
+device_ps_jacs = DeviceDerivatives(dev_ps, ps_config)
 
-jpc_matrix = [param_shift_jpc, hadamard_grad_jpc]
+transform_jpc_matrix = [param_shift_jpc, hadamard_grad_jpc]
+dev_jpc_matrix = [device_jacs, legacy_device_jacs, device_ps_jacs]
+jpc_matrix = [param_shift_jpc, hadamard_grad_jpc, device_jacs, legacy_device_jacs, device_ps_jacs]
+
+
+def _accepts_finite_shots(jpc):
+    if isinstance(jpc, TransformJacobianProducts):
+        return True
+    if isinstance(jpc, DeviceDerivatives):
+        return isinstance(jpc._device, ParamShiftDerivativesDevice)
+    return False
+
+
+def _tol_for_shots(shots):
+    return 0.05 if shots else 1e-6
 
 
 # pylint: disable=too-few-public-methods
@@ -62,34 +86,368 @@ def test_transform_jacobian_product_basics(self):
         )
         assert repr(jpc) == expected_repr
 
+    def test_device_jacobians_initialization_new_dev(self):
+        """Tests the private attributes are set during initialization of a DeviceDerivatives class."""
+
+        device = qml.device("default.qubit")
+        config = qml.devices.ExecutionConfig(gradient_method="adjoint")
+
+        jpc = DeviceDerivatives(device, config)
+
+        assert jpc._device is device
+        assert jpc._execution_config is config
+        assert jpc._gradient_kwargs == {}
+        assert jpc._uses_new_device is True
+        assert isinstance(jpc._results_cache, LRUCache)
+        assert len(jpc._results_cache) == 0
+        assert isinstance(jpc._jacs_cache, LRUCache)
+        assert len(jpc._jacs_cache) == 0
+
+    def test_device_jacobians_initialization_old_dev(self):
+        """Test the private attributes are set during initialization of a DeviceDerivatives class with the
+        old device interface."""
+
+        device = qml.devices.DefaultQubitLegacy(wires=5)
+        gradient_kwargs = {"method": "adjoint_jacobian"}
+
+        jpc = DeviceDerivatives(device, gradient_kwargs=gradient_kwargs)
+
+        assert jpc._device is device
+        assert jpc._gradient_kwargs == gradient_kwargs
+        assert jpc._uses_new_device is False
+        assert isinstance(jpc._results_cache, LRUCache)
+        assert len(jpc._results_cache) == 0
+        assert isinstance(jpc._jacs_cache, LRUCache)
+        assert len(jpc._jacs_cache) == 0
+
+    def test_device_jacobians_repr(self):
+        """Test the repr method for device jacobians."""
+        device = qml.device("default.qubit")
+        config = qml.devices.ExecutionConfig(gradient_method="adjoint")
+
+        jpc = DeviceDerivatives(device, config)
+
+        expected = (
+            r"<DeviceDerivatives: default.qubit, {},"
+            r" ExecutionConfig(grad_on_execution=None, use_device_gradient=None,"
+            r" gradient_method='adjoint', gradient_keyword_arguments={},"
+            r" device_options={}, interface=None, derivative_order=1)>"
+        )
+
+        assert repr(jpc) == expected
+
 
 @pytest.mark.parametrize("jpc", jpc_matrix)
+@pytest.mark.parametrize("shots", (None, 10000, (10000, 10000)))
 class TestJacobianProductResults:
     """Test first order results for the matrix of jpc options."""
 
-    def test_execute_jvp_basic(self, jpc):
+    def test_execute_jvp_basic(self, jpc, shots):
         """Test execute_and_compute_jvp for a simple single input single output."""
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+
         x = 0.92
-        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))])
+        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))], shots=shots)
         tangents = ((0.5,),)
         res, jvp = jpc.execute_and_compute_jvp((tape,), tangents)
-        assert qml.math.allclose(res[0], np.cos(x))
-        assert qml.math.allclose(jvp[0], -0.5 * np.sin(x))
 
-    def test_vjp_basic(self, jpc):
+        if tape.shots.has_partitioned_shots:
+            assert len(res[0]) == 2
+            assert len(jvp[0]) == 2
+        else:
+            assert qml.math.shape(res[0]) == tuple()
+            assert qml.math.shape(jvp[0]) == tuple()
+
+        assert qml.math.allclose(res[0], np.cos(x), atol=_tol_for_shots(shots))
+        assert qml.math.allclose(jvp[0], -0.5 * np.sin(x), atol=_tol_for_shots(shots))
+
+        if tape.shots.has_partitioned_shots:
+            assert qml.math.allclose(res[0][1], np.cos(x), atol=_tol_for_shots(shots))
+            assert qml.math.allclose(jvp[0][1], -0.5 * np.sin(x), atol=_tol_for_shots(shots))
+
+    def test_vjp_basic(self, jpc, shots):
         """Test compute_vjp for a simple single input single output."""
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+
         x = -0.294
-        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))])
-        dy = ((1.8,),)
+        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))], shots=shots)
+
+        dy = ((1.1, 0.7),) if tape.shots.has_partitioned_shots else (1.8,)
         vjp = jpc.compute_vjp((tape,), dy)
-        assert qml.math.allclose(vjp[0], -1.8 * np.sin(x))
 
-    def test_jacobian_basic(self, jpc):
+        assert qml.math.allclose(vjp[0], -1.8 * np.sin(x), atol=_tol_for_shots(shots))
+
+    def test_jacobian_basic(self, jpc, shots):
         """Test compute_jacobian for a simple single input single output."""
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+
         x = 1.62
-        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))])
+        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))], shots=shots)
         jac = jpc.compute_jacobian((tape,))
-        assert qml.math.allclose(jac, -np.sin(x))
+        assert qml.math.allclose(jac, -np.sin(x), atol=_tol_for_shots(shots))
+
+    def test_batch_execute_jvp(self, jpc, shots):
+        """Test execute_and_compute_jvp on a batch with ragged observables and parameters."""
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+        x = -0.92
+        y = 0.84
+        phi = 1.62
+
+        tape1 = qml.tape.QuantumScript(
+            [qml.RX(x, 0), qml.RY(y, 1), qml.CNOT((0, 1))],
+            [qml.expval(qml.PauliX(1)), qml.expval(qml.PauliY(0))],
+        )
+        tape2 = qml.tape.QuantumScript(
+            [qml.Hadamard(0), qml.IsingXX(phi, wires=(0, 1))],
+            [qml.expval(qml.PauliZ(1))],
+            shots=shots,
+        )
+
+        tangents = ((2.0, 3.0), (0.5,))
+
+        res, jvps = jpc.execute_and_compute_jvp((tape1, tape2), tangents)
+
+        assert qml.math.allclose(res[0][0], np.sin(y), atol=_tol_for_shots(shots))
+        assert qml.math.allclose(res[0][1], -np.sin(x) * np.sin(y), atol=_tol_for_shots(shots))
+        assert qml.math.allclose(res[1], np.cos(phi), atol=_tol_for_shots(shots))
+
+        assert qml.math.allclose(jvps[0][0], 3.0 * np.cos(y), atol=_tol_for_shots(shots))
+        assert qml.math.allclose(
+            jvps[0][1],
+            -2.0 * np.cos(x) * np.sin(y) - 3.0 * np.sin(x) * np.cos(y),
+            atol=_tol_for_shots(shots),
+        )
+        assert qml.math.allclose(jvps[1], -0.5 * np.sin(phi), atol=_tol_for_shots(shots))
+
+    def test_batch_vjp(self, jpc, shots):
+        """Test compute_vjp on a batch with ragged observables and parameters."""
+
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+        if jpc is hadamard_grad_jpc and qml.measurements.Shots(shots).has_partitioned_shots:
+            pytest.skip(
+                "hadamard gradient does not support multiple measurments with partitioned shots."
+            )
+
+        x = 0.385
+        y = 1.92
+        phi = -1.05
+
+        tape1 = qml.tape.QuantumScript(
+            [qml.RX(x, 0), qml.RY(y, 1), qml.CNOT((0, 1))],
+            [qml.expval(qml.PauliX(1)), qml.expval(qml.PauliY(0))],
+            shots=shots,
+        )
+        tape2 = qml.tape.QuantumScript(
+            [qml.Hadamard(0), qml.IsingXX(phi, wires=(0, 1))],
+            [qml.expval(qml.PauliZ(1))],
+            shots=shots,
+        )
+
+        if tape1.shots.has_partitioned_shots:
+            dy1 = ((0.3, 0.2), (0.2, 0.4))
+            dy2 = (0.4, 0.5)
+            dy = (dy1, dy2)
+        else:
+            dy = ((0.5, 0.6), (0.9,))
+
+        vjps = jpc.compute_vjp((tape1, tape2), dy)
+
+        assert qml.math.allclose(
+            vjps[0][0], -0.6 * np.cos(x) * np.sin(y), atol=_tol_for_shots(shots)
+        )  # dx
+        assert qml.math.allclose(
+            vjps[0][1], 0.5 * np.cos(y) - 0.6 * np.sin(x) * np.cos(y), atol=_tol_for_shots(shots)
+        )  # dy
+        assert qml.math.allclose(vjps[1], -0.9 * np.sin(phi), atol=_tol_for_shots(shots))
+
+    def test_batch_jacobian(self, jpc, shots):
+        """Check compute_jacobian for a batch of dissimilar tapes.
+
+        One tape has two gate parameters and two measurements; the other has a
+        single gate parameter and a single measurement.
+        """
+
+        if shots and not _accepts_finite_shots(jpc):
+            pytest.skip("jpc does not work with finite shots.")
+        if jpc is hadamard_grad_jpc and qml.measurements.Shots(shots).has_partitioned_shots:
+            pytest.skip(
+                "hadamard gradient does not work with partitioned shots and multiple measurements."
+            )
+
+        x = np.array(0.28)
+        y = np.array(1.62)
+        phi = np.array(0.6293)
+
+        two_param_tape = qml.tape.QuantumScript(
+            [qml.RX(x, 0), qml.RY(y, 1), qml.CNOT((0, 1))],
+            [qml.expval(qml.PauliX(1)), qml.expval(qml.PauliY(0))],
+            shots=shots,
+        )
+        one_param_tape = qml.tape.QuantumScript(
+            [qml.Hadamard(0), qml.IsingXX(phi, wires=(0, 1))],
+            [qml.expval(qml.PauliZ(1))],
+            shots=shots,
+        )
+
+        # The single-parameter tape is deliberately placed first in the batch.
+        jacs = jpc.compute_jacobian((one_param_tape, two_param_tape))
+
+        # Pair the two tapes' jacobians once per shot partition so that the same
+        # assertions cover both the partitioned and the unpartitioned layout.
+        if two_param_tape.shots.has_partitioned_shots:
+            jac_pairs = [(jacs[0][i], jacs[1][i]) for i in [0, 1]]
+        else:
+            jac_pairs = [(jacs[0], jacs[1])]
+
+        atol = _tol_for_shots(shots)
+        for jac_phi, jac_xy in jac_pairs:
+            # tape with the single IsingXX parameter
+            assert qml.math.allclose(jac_phi, -np.sin(phi), atol=atol)
+            # first measurement of the two-parameter tape, entries for x then y
+            assert qml.math.allclose(jac_xy[0][0], 0, atol=atol)
+            assert qml.math.allclose(jac_xy[0][1], np.cos(y), atol=atol)
+            # second measurement of the two-parameter tape, entries for x then y
+            assert qml.math.allclose(jac_xy[1][0], -np.cos(x) * np.sin(y), atol=atol)
+            assert qml.math.allclose(jac_xy[1][1], -np.sin(x) * np.cos(y), atol=atol)
+
+
+@pytest.mark.parametrize("jpc", dev_jpc_matrix)
+class TestCachingDeviceDerivatives:
+    """Tests that device-derivative calculators cache results and jacobians per batch."""
+
+    def test_execution_caching(self, jpc):
+        """Test that jacobians cached by execute_and_cache_jacobian are reused by later calls."""
+        tape1 = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+        batch = (tape1,)
+
+        with jpc._device.tracker:
+            results = jpc.execute_and_cache_jacobian(batch)
+
+        assert qml.math.allclose(results[0], np.cos(0.1))
+        assert jpc._device.tracker.totals["execute_and_derivative_batches"] == 1
+        assert jpc._device.tracker.totals["derivatives"] == 1
+
+        # extra execution since it needs to do the forward pass again.
+        if jpc._uses_new_device:
+            expected_execs = 3 if isinstance(jpc._device, ParamShiftDerivativesDevice) else 1
+        else:
+            expected_execs = 2
+        assert jpc._device.tracker.totals["executions"] == expected_execs
+
+        # Test reuse with compute_jacobian: nothing new should be tracked on the device
+        with jpc._device.tracker:
+            jac = jpc.compute_jacobian(batch)
+
+        assert qml.math.allclose(jac, -np.sin(0.1))
+        assert jpc._device.tracker.totals.get("derivatives", 0) == 0
+        assert jpc._device.tracker.totals.get("executions", 0) == 0
+
+        # Test reuse with execute_and_compute_jvp
+        with jpc._device.tracker:
+            res2, jvp = jpc.execute_and_compute_jvp(batch, ((0.5,),))
+
+        assert qml.math.allclose(res2, results)
+        assert qml.math.allclose(jvp, 0.5 * -np.sin(0.1))
+        assert jpc._device.tracker.totals.get("derivatives", 0) == 0
+        assert jpc._device.tracker.totals.get("executions", 0) == 0
+
+        # Test reuse with compute_vjp
+        with jpc._device.tracker:
+            vjp = jpc.compute_vjp(batch, ((1.5,),))
+
+        assert qml.math.allclose(vjp, -1.5 * np.sin(0.1))
+        assert jpc._device.tracker.totals.get("derivatives", 0) == 0
+        assert jpc._device.tracker.totals.get("executions", 0) == 0
+
+        # Test that the device is called again if the batch is a new instance, even if identical
+        tape2 = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+        batch2 = (tape2,)
+
+        with jpc._device.tracker:
+            jac2 = jpc.compute_jacobian(batch2)
+
+        assert qml.math.allclose(jac, jac2)
+        assert jpc._device.tracker.totals["derivatives"] == 1
+        if jpc._uses_new_device:
+            expected_execs = 2 if isinstance(jpc._device, ParamShiftDerivativesDevice) else 0
+        else:
+            expected_execs = 1
+        assert jpc._device.tracker.totals.get("executions", 0) == expected_execs
+
+    def test_cached_on_execute_and_compute_jvps(self, jpc):
+        """Test that execute_and_compute_jvp caches results and Jacobians if they are not precalculated."""
+        tape1 = qml.tape.QuantumScript(
+            [qml.Hadamard(0), qml.IsingXX(0.8, wires=(0, 1))], [qml.expval(qml.PauliZ(1))]
+        )
+        batch = (tape1,)
+        tangents = ((0.5,),)
+
+        with jpc._device.tracker:
+            res, jvps = jpc.execute_and_compute_jvp(batch, tangents)
+
+        assert jpc._device.tracker.totals["execute_and_derivative_batches"] == 1
+
+        assert qml.math.allclose(res, np.cos(0.8))
+        assert qml.math.allclose(jvps, -0.5 * np.sin(0.8))
+
+        assert jpc._results_cache[batch] is res  # the returned results object itself is cached
+        assert qml.math.allclose(jpc._jacs_cache[batch], (-np.sin(0.8)))
+
+        with jpc._device.tracker:
+            jpc.execute_and_compute_jvp(batch, tangents)
+
+        assert jpc._device.tracker.totals.get("derivatives", 0) == 0
+        assert jpc._device.tracker.totals.get("executions", 0) == 0
+
+    def test_cached_on_vjps(self, jpc):
+        """Test that compute_vjp caches only the jacobians, not the execution results."""
+
+        tape1 = qml.tape.QuantumScript([qml.RZ(0.5, wires=0)], [qml.expval(qml.PauliX(0))])
+        batch = (tape1,)
+        dy = ((0.5,),)
+
+        with jpc._device.tracker:
+            jpc.compute_vjp(batch, dy)
+
+        if isinstance(jpc._device, ParamShiftDerivativesDevice):
+            expected = 2
+        elif isinstance(jpc._device, qml.devices.Device):
+            expected = 0
+        else:
+            expected = 1
+
+        assert jpc._device.tracker.totals.get("executions", 0) == expected
+
+        assert batch not in jpc._results_cache
+        assert qml.math.allclose(jpc._jacs_cache[batch], 0)
+
+        with jpc._device.tracker:
+            jpc.execute_and_compute_jvp(batch, ((0.5,),))
+
+        assert jpc._device.tracker.totals["executions"] == 1  # results were not cached above
+        assert jpc._device.tracker.totals.get("derivatives", 0) == 0
+        assert qml.math.allclose(jpc._results_cache[batch], 0)
+
+    def test_error_cant_cache_results_without_jac(self, jpc):
+        """Test that a NotImplementedError is raised if somehow the results are cached
+        without the jac being cached and execute_and_compute_jvp is called."""
+
+        tape = qml.tape.QuantumScript([], [qml.state()])
+        batch = (tape,)
+        jpc._results_cache[batch] = "value"  # seed the results cache only, bypassing the jpc API
+
+        with pytest.raises(NotImplementedError):
+            jpc.execute_and_compute_jvp(batch, tuple())
+
+
+@pytest.mark.parametrize("jpc", transform_jpc_matrix + [device_ps_jacs])
+class TestProbsTransformJacobians:
+    """Testing results when probabilities are returned. This only works with gradient transforms."""
 
     def test_execute_jvp_multi_params_multi_out(self, jpc):
         """Test execute_and_compute_jvp with multiple parameters and multiple outputs"""
diff --git a/tests/param_shift_dev.py b/tests/param_shift_dev.py
new file mode 100644
index 00000000000..e1d95f6b2f5
--- /dev/null
+++ b/tests/param_shift_dev.py
@@ -0,0 +1,75 @@
+# Copyright 2018-2023 Xanadu Quantum Technologies Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file provides a device that calculates derivatives via parameter shift.
+"""
+
+import dataclasses
+
+import pennylane as qml
+
+
+# pylint: disable=unused-argument
+class ParamShiftDerivativesDevice(qml.devices.DefaultQubit):
+    """Test device that computes its own derivatives with the parameter-shift transform."""
+
+    name = "param_shift.qubit"
+
+    def preprocess(self, execution_config=qml.devices.DefaultExecutionConfig):
+        """Preprocess, enabling ``use_device_gradient`` for "device"/"parameter-shift" configs."""
+        if execution_config.gradient_method in {"device", "parameter-shift"}:
+            execution_config = dataclasses.replace(execution_config, use_device_gradient=True)
+        return super().preprocess(execution_config)
+
+    def supports_derivatives(self, execution_config=None, circuit=None):
+        """Return True for no config, or for the "device"/"parameter-shift" gradient methods."""
+        if execution_config is None:
+            return True
+        return execution_config.gradient_method in {"device", "parameter-shift"}
+
+    def compute_derivatives(self, circuits, execution_config=None):
+        """Return parameter-shift jacobians for one circuit or for a batch of circuits."""
+        is_single_circuit = isinstance(circuits, qml.tape.QuantumScript)
+        if is_single_circuit:
+            circuits = (circuits,)
+
+        if self.tracker.active:
+            self.tracker.update(derivative_batches=1, derivatives=len(circuits))
+            self.tracker.record()
+
+        diff_batch, fn = qml.transforms.map_batch_transform(qml.gradients.param_shift, circuits)
+        diff_results = self.execute(diff_batch)
+
+        jacs = fn(diff_results)
+        return jacs[0] if is_single_circuit else jacs
+
+    def execute_and_compute_derivatives(self, circuits, execution_config=None):
+        """Return ``(results, jacobians)`` for one circuit or for a batch of circuits."""
+        is_single_circuit = isinstance(circuits, qml.tape.QuantumScript)
+        if is_single_circuit:
+            circuits = (circuits,)
+
+        if self.tracker.active:
+            for c in circuits:
+                self.tracker.update(resources=c.specs["resources"])
+            self.tracker.update(execute_and_derivative_batches=1, derivatives=len(circuits))
+            self.tracker.record()
+
+        # Run the original and gradient circuits in one device batch, then split the results.
+        diff_batch, fn = qml.transforms.map_batch_transform(qml.gradients.param_shift, circuits)
+        combined_batch = tuple(circuits) + tuple(diff_batch)
+        all_results = self.execute(combined_batch)
+        results = all_results[: len(circuits)]
+        jacs = fn(all_results[len(circuits) :])
+        return (results[0], jacs[0]) if is_single_circuit else (results, jacs)
diff --git a/tests/test_device.py b/tests/test_device.py
index c3c8b0c253d..b04db9813ec 100644
--- a/tests/test_device.py
+++ b/tests/test_device.py
@@ -206,6 +206,25 @@ def get_device(wires):
         yield get_device
 
 
+def test_gradients_record():
+    """Check that the tracker counts derivatives in execute_and_gradients and gradients."""
+
+    legacy_dev = qml.device("default.qubit.legacy", wires=1)
+    script = qml.tape.QuantumScript([qml.RX(0.1, wires=0)], [qml.expval(qml.PauliZ(0))])
+    batch = (script, script)
+    grad_kwargs = {"method": "adjoint_jacobian", "use_device_state": True}
+
+    with legacy_dev.tracker:
+        legacy_dev.execute_and_gradients(batch, **grad_kwargs)
+    totals = legacy_dev.tracker.totals
+    assert totals["execute_and_derivative_batches"] == 1
+    assert totals["derivatives"] == 2
+
+    with legacy_dev.tracker:
+        legacy_dev.gradients(batch, **grad_kwargs)
+    assert legacy_dev.tracker.totals["derivatives"] == 2
+
+
 class TestDeviceSupportedLogic:
     """Test the logic associated with the supported operations and observables"""
 
diff --git a/tests/test_qubit_device.py b/tests/test_qubit_device.py
index c0600f64f75..ade01093ca8 100644
--- a/tests/test_qubit_device.py
+++ b/tests/test_qubit_device.py
@@ -1426,7 +1426,7 @@ def test_tracker_multi_execution(self, dev_name):
         ):
             assert tracked_r == expected_r
 
-    @pytest.mark.all_interfaces
+    @pytest.mark.autograd
     def test_tracker_grad(self):
         """Test that the tracker can track resources through a gradient computation"""
         dev = qml.device("default.qubit.legacy", wires=1, shots=100)