diff --git a/doc/releases/changelog-dev.md b/doc/releases/changelog-dev.md index ac75e056a36..e5d0d5c23df 100644 --- a/doc/releases/changelog-dev.md +++ b/doc/releases/changelog-dev.md @@ -15,10 +15,11 @@ * `qml.draw` now supports drawing mid-circuit measurements. [(#4775)](https://github.com/PennyLaneAI/pennylane/pull/4775) -* Autograd can now use vjps provided by the device from the new device API. If a device provides +* Autograd and torch can now use vjps provided by the device from the new device API. If a device provides a vector Jacobian product, this can be selected by providing `device_vjp=True` to `qml.execute`. [(#4557)](https://github.com/PennyLaneAI/pennylane/pull/4557) + [(#4654)](https://github.com/PennyLaneAI/pennylane/pull/4654) * Updates to some relevant Pytests to enable its use as a suite of benchmarks. [(#4703)](https://github.com/PennyLaneAI/pennylane/pull/4703) diff --git a/pennylane/interfaces/execution.py b/pennylane/interfaces/execution.py index 6c4e953e4df..4e1c5aa771b 100644 --- a/pennylane/interfaces/execution.py +++ b/pennylane/interfaces/execution.py @@ -43,7 +43,7 @@ device_type = Union[qml.Device, "qml.devices.Device"] -jpc_interfaces = {"autograd", "numpy"} +jpc_interfaces = {"autograd", "numpy", "torch", "pytorch"} INTERFACE_MAP = { None: "Numpy", diff --git a/pennylane/interfaces/jacobian_products.py b/pennylane/interfaces/jacobian_products.py index 7307aee62d5..6cf385c221c 100644 --- a/pennylane/interfaces/jacobian_products.py +++ b/pennylane/interfaces/jacobian_products.py @@ -44,7 +44,6 @@ def _compute_vjps(jacs, dys, tapes): vjps.append(qml.math.sum(qml.math.stack(shot_vjps), axis=0)) else: vjps.append(f[multi](dy, jac)) - return tuple(vjps) diff --git a/pennylane/interfaces/torch.py b/pennylane/interfaces/torch.py index 33e72e2c903..982c3150d5e 100644 --- a/pennylane/interfaces/torch.py +++ b/pennylane/interfaces/torch.py @@ -14,6 +14,51 @@ """ This module contains functions for adding the PyTorch interface to a PennyLane Device class. + +**How to bind a custom derivative with Torch.** + +See `the Torch documentation `_ for more complete +information. + +Suppose I have a function ``f`` that I want to define a custom vjp for. + +We need to inherit from ``torch.autograd.Function`` and define ``forward`` and ``backward`` static +methods. + +.. code-block:: python + + class CustomFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, exponent=2): + ctx.saved_info = {'x': x, 'exponent': exponent} + return x ** exponent + + @staticmethod + def backward(ctx, dy): + x = ctx.saved_info['x'] + exponent = ctx.saved_info['exponent'] + print(f"Calculating the gradient with x={x}, dy={dy}, exponent={exponent}") + return dy * exponent * x ** (exponent-1), None + +To use the ``CustomFunction`` class, we call it with the static ``apply`` method. + +>>> val = torch.tensor(2.0, requires_grad=True) +>>> res = CustomFunction.apply(val) +>>> res +tensor(4., grad_fn=) +>>> res.backward() +>>> val.grad +Calculating the gradient with x=2.0, dy=1.0, exponent=2 +tensor(4.) + +Note that for custom functions, the output of ``forward`` and the output of ``backward`` are flattened iterables of +Torch arrays. While autograd and jax can handle nested result objects like ``((np.array(1), np.array(2)), np.array(3))``, +torch requires that it be flattened like ``(np.array(1), np.array(2), np.array(3))``. 
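As a standalone illustration of this flattening requirement (``flatten`` and ``unflatten`` below are hypothetical helper names used only for this sketch, not functions defined in this module):

.. code-block:: python

    def flatten(results):
        # recursively unpack a nested tuple/list of arrays into a flat list
        flat = []
        for r in results:
            if isinstance(r, (tuple, list)):
                flat.extend(flatten(r))
            else:
                flat.append(r)
        return flat

    def unflatten(flat, structure):
        # repack a flat sequence according to a template structure
        flat = list(flat)

        def build(template):
            if isinstance(template, (tuple, list)):
                return tuple(build(t) for t in template)
            return flat.pop(0)

        return build(structure)

>>> nested = ((np.array(1), np.array(2)), np.array(3))
>>> flatten(nested)
[array(1), array(2), array(3)]
>>> unflatten(flatten(nested), nested)
((array(1), array(2)), array(3))
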
The ``pytreeify`` class decorator +modifies the output of ``forward`` and the input to ``backward`` to unpack and repack the nested structure of the PennyLane +result object. + + """ # pylint: disable=too-many-arguments,protected-access,abstract-method import inspect @@ -64,27 +109,6 @@ def new_backward(ctx, *flat_grad_outputs): return cls -def _compute_vjps(dys, jacs, multi_measurements): - """Compute the vjps of multiple tapes, directly for a Jacobian and tangents.""" - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Entry with args=(dys=%s, jacs=%s, multi_measurements=%s) called by=%s", - dys, - jacs, - multi_measurements, - "::L".join(str(i) for i in inspect.getouterframes(inspect.currentframe(), 2)[1][1:3]), - ) - - vjps = [] - - for i, multi in enumerate(multi_measurements): - compute_func = ( - qml.gradients.compute_vjp_multi if multi else qml.gradients.compute_vjp_single - ) - vjps.extend(compute_func(dys[i], jacs[i])) - return vjps - - @pytreeify class ExecuteTapes(torch.autograd.Function): """The signature of this ``torch.autograd.Function`` is designed to @@ -96,26 +120,17 @@ class ExecuteTapes(torch.autograd.Function): as the first argument ``kwargs``. This dictionary **must** contain: * ``"tapes"``: the quantum tapes to batch evaluate - * ``"device"``: the quantum device to use to evaluate the tapes - * ``"execute_fn"``: the execution function to use on forward passes - * ``"gradient_fn"``: the gradient transform function to use - for backward passes - * ``"gradient_kwargs"``: gradient keyword arguments to pass to the - gradient function - * ``"max_diff``: the maximum order of derivatives to support + * ``"execute_fn"``: a function that calculates the results of the tapes + * ``"jpc"``: a :class:`~.JacobianProductCalculator` that can compute the vjp. Further, note that the ``parameters`` argument is dependent on the ``tapes``; this function should always be called with the parameters extracted directly from the tapes as follows: - >>> parameters = [] - >>> [parameters.extend(t.get_parameters()) for t in tapes] - >>> kwargs = {"tapes": tapes, "device": device, "gradient_fn": gradient_fn, ...} + >>> parameters = [p for t in tapes for p in t.get_parameters()] + >>> kwargs = {"tapes": tapes, "execute_fn": execute_fn, "jpc": jpc} >>> ExecuteTapes.apply(kwargs, *parameters) - The private argument ``_n`` is used to track nesting of derivatives, for example - if the nth-order derivative is requested. Do not set this argument unless you - understand the consequences! 
""" @staticmethod @@ -133,16 +148,9 @@ def forward(ctx, kwargs, *parameters): # pylint: disable=arguments-differ ) ctx.tapes = kwargs["tapes"] - ctx.device = kwargs["device"] + ctx.jpc = kwargs["jpc"] - ctx.execute_fn = kwargs["execute_fn"] - ctx.gradient_fn = kwargs["gradient_fn"] - - ctx.gradient_kwargs = kwargs["gradient_kwargs"] - ctx.max_diff = kwargs["max_diff"] - ctx._n = kwargs.get("_n", 1) - - res, ctx.jacs = ctx.execute_fn(ctx.tapes, **ctx.gradient_kwargs) + res = tuple(kwargs["execute_fn"](ctx.tapes)) # if any input tensor uses the GPU, the output should as well ctx.torch_device = None @@ -151,12 +159,7 @@ def forward(ctx, kwargs, *parameters): # pylint: disable=arguments-differ if isinstance(p, torch.Tensor) and p.is_cuda: # pragma: no cover ctx.torch_device = p.get_device() break - res = tuple(_res_to_torch(r, ctx) for r in res) - for i, _ in enumerate(res): - # In place change of the numpy array Jacobians to Torch objects - _jac_to_torch(i, ctx) - return res @staticmethod @@ -173,124 +176,39 @@ def backward(ctx, *dy): ), ) - multi_measurements = [len(tape.measurements) > 1 for tape in ctx.tapes] - - if ctx.jacs: - # Jacobians were computed on the forward pass (mode="forward") - # No additional quantum evaluations needed; simply compute the VJPs directly. - vjps = _compute_vjps(dy, ctx.jacs, multi_measurements) - - else: - # Need to compute the Jacobians on the backward pass (accumulation="backward") - - if isinstance(ctx.gradient_fn, qml.transforms.core.TransformDispatcher): - # Gradient function is a gradient transform. - - # Generate and execute the required gradient tapes - if ctx._n < ctx.max_diff: - # The derivative order is less than the max derivative order. - # Compute the VJP recursively by using the gradient transform - # and calling ``execute`` to compute the results. - # This will allow higher-order derivatives to be computed - # if requested. - - vjp_tapes, processing_fn = qml.gradients.batch_vjp( - ctx.tapes, - dy, - ctx.gradient_fn, - reduction="extend", - gradient_kwargs=ctx.gradient_kwargs, - ) - # This is where the magic happens. Note that we call ``execute``. - # This recursion, coupled with the fact that the gradient transforms - # are differentiable, allows for arbitrary order differentiation. - res = execute( - vjp_tapes, - ctx.device, - ctx.execute_fn, - ctx.gradient_fn, - ctx.gradient_kwargs, - _n=ctx._n + 1, - max_diff=ctx.max_diff, - ) - vjps = processing_fn(res) - - else: - # The derivative order is at the maximum. Compute the VJP - # in a non-differentiable manner to reduce overhead. - vjp_tapes, processing_fn = qml.gradients.batch_vjp( - ctx.tapes, - dy, - ctx.gradient_fn, - reduction="extend", - gradient_kwargs=ctx.gradient_kwargs, - ) - - vjps = processing_fn(ctx.execute_fn(vjp_tapes)[0]) - - else: - # Gradient function is not a gradient transform - # (e.g., it might be a device method). - # Note that unlike the previous branch: - # - # - there is no recursion here - # - gradient_fn is not differentiable - # - # so we cannot support higher-order derivatives. 
- - jacs = ctx.gradient_fn(ctx.tapes, **ctx.gradient_kwargs) - - vjps = _compute_vjps(dy, jacs, multi_measurements) - - # Remove empty vjps (from tape with non trainable params) - vjps = [vjp for vjp in vjps if list(vjp.shape) != [0]] + vjps = ctx.jpc.compute_vjp(ctx.tapes, dy) + + # split tensor into separate entries + unpacked_vjps = [] + for vjp_slice in vjps: + if vjp_slice is not None and np.squeeze(vjp_slice).shape != (0,): + unpacked_vjps.extend(_res_to_torch(vjp_slice, ctx)) + vjps = tuple(unpacked_vjps) # The output of backward must match the input of forward. # Therefore, we return `None` for the gradient of `kwargs`. - return (None,) + tuple(vjps) + return (None,) + vjps -def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_diff=1): +def execute(tapes, execute_fn, jpc): """Execute a batch of tapes with Torch parameters on a device. - This function may be called recursively, if ``gradient_fn`` is a differentiable - transform, and ``_n < max_diff``. Args: tapes (Sequence[.QuantumTape]): batch of tapes to execute - device (pennylane.Device): Device to use to execute the batch of tapes. - If the device does not provide a ``batch_execute`` method, - by default the tapes will be executed in serial. - execute_fn (callable): The execution function used to execute the tapes - during the forward pass. This function must return a tuple ``(results, jacobians)``. - If ``jacobians`` is an empty list, then ``gradient_fn`` is used to - compute the gradients during the backwards pass. - gradient_kwargs (dict): dictionary of keyword arguments to pass when - determining the gradients of tapes - gradient_fn (callable): the gradient function to use to compute quantum gradients - _n (int): a positive integer used to track nesting of derivatives, for example - if the nth-order derivative is requested. - max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies - the maximum order of derivatives to support. Increasing this value allows - for higher order derivatives to be extracted, at the cost of additional - (classical) computational overhead during the backwards pass. + execute_fn (Callable[[Sequence[.QuantumTape]], ResultBatch]): a function that turns a batch of circuits into results + jpc (JacobianProductCalculator): a class that can compute the vector jacobian product for the input tapes. + Returns: - list[list[torch.Tensor]]: A nested list of tape results. Each element in - the returned list corresponds in order to the provided tapes. + TensorLike: A nested tuple of tape results. Each element in + the returned tuple corresponds in order to the provided tapes. 
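    For example, a rough sketch of wiring this entry point up by hand (normally ``qml.execute``
    does this wiring; the ``TransformJacobianProducts`` class and its constructor arguments are
    assumptions based on the accompanying ``jacobian_products`` module and may differ):

    .. code-block:: python

        import torch
        import pennylane as qml
        from pennylane.interfaces.jacobian_products import TransformJacobianProducts

        dev = qml.device("default.qubit")

        def execute_fn(tapes):
            # forward pass: turn the batch of tapes into results
            return dev.execute(tapes)

        # anything exposing ``compute_vjp(tapes, dy)`` can be passed as ``jpc``
        jpc = TransformJacobianProducts(execute_fn, qml.gradients.param_shift)

        x = torch.tensor(0.543, requires_grad=True)
        tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))])

        (res,) = execute((tape,), execute_fn, jpc)
        res.backward()  # the vjp is requested from ``jpc`` on the backward pass
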
""" if logger.isEnabledFor(logging.DEBUG): logger.debug( - "Entry with args=(tapes=%s, device=%s, execute_fn=%s, gradient_fn=%s, gradient_kwargs=%s, _n=%s, max_diff=%s) called by=%s", + "Entry with args=(tapes=%s, execute_fn=%s, jpc=%s", tapes, - repr(device), - execute_fn - if not (logger.isEnabledFor(qml.logging.TRACE) and inspect.isfunction(execute_fn)) - else "\n" + inspect.getsource(execute_fn) + "\n", - gradient_fn - if not (logger.isEnabledFor(qml.logging.TRACE) and inspect.isfunction(gradient_fn)) - else "\n" + inspect.getsource(gradient_fn) + "\n", - gradient_kwargs, - _n, - max_diff, - "::L".join(str(i) for i in inspect.getouterframes(inspect.currentframe(), 2)[1][1:3]), + f"\n{inspect.getsource(execute_fn)}\n" + if logger.isEnabledFor(qml.logging.TRACE) + else execute_fn, + jpc, ) # pylint: disable=unused-argument @@ -302,13 +220,9 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_d parameters.extend(tape.get_parameters()) kwargs = { - "tapes": tapes, - "device": device, + "tapes": tuple(tapes), "execute_fn": execute_fn, - "gradient_fn": gradient_fn, - "gradient_kwargs": gradient_kwargs, - "_n": _n, - "max_diff": max_diff, + "jpc": jpc, } return ExecuteTapes.apply(kwargs, *parameters) @@ -316,49 +230,8 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_d def _res_to_torch(r, ctx): """Convert results from unwrapped execution to torch.""" + if isinstance(r, dict): + return r if isinstance(r, (list, tuple)): - res = [] - for t in r: - if isinstance(t, dict) or isinstance(t, list) and all(isinstance(i, dict) for i in t): - # count result, single or broadcasted - res.append(t) - else: - if isinstance(t, tuple): - res.append(tuple(torch.as_tensor(el, device=ctx.torch_device) for el in t)) - else: - res.append(torch.as_tensor(t, device=ctx.torch_device)) - if isinstance(r, tuple): - res = tuple(res) - elif isinstance(r, dict): - res = r - else: - res = torch.as_tensor(r, device=ctx.torch_device) - - return res - - -def _jac_to_torch(i, ctx): - """Convert Jacobian from unwrapped execution to torch in the given ctx.""" - if ctx.jacs: - ctx_jacs = list(ctx.jacs) - multi_m = len(ctx.tapes[i].measurements) > 1 - multi_p = len(ctx.tapes[i].trainable_params) > 1 - - # Multiple measurements and parameters: Jacobian is a tuple of tuple - if multi_p and multi_m: - jacobians = [] - for jacobian in ctx_jacs[i]: - inside_nested_jacobian = [ - torch.as_tensor(j, device=ctx.torch_device) for j in jacobian - ] - inside_nested_jacobian_tuple = tuple(inside_nested_jacobian) - jacobians.append(inside_nested_jacobian_tuple) - ctx_jacs[i] = tuple(jacobians) - # Single measurement and single parameter: Jacobian is a tensor - elif not multi_p and not multi_m: - ctx_jacs[i] = torch.as_tensor(np.array(ctx_jacs[i]), device=ctx.torch_device) - # Multiple measurements or multiple parameters: Jacobian is a tuple - else: - jacobian = [torch.as_tensor(jac, device=ctx.torch_device) for jac in ctx_jacs[i]] - ctx_jacs[i] = tuple(jacobian) - ctx.jacs = tuple(ctx_jacs) + return type(r)(_res_to_torch(t, ctx) for t in r) + return torch.as_tensor(r, device=ctx.torch_device) diff --git a/tests/interfaces/default_qubit_2_integration/test_torch_default_qubit_2.py b/tests/interfaces/default_qubit_2_integration/test_torch_default_qubit_2.py index d4ae61f39b2..c193dd3feeb 100644 --- a/tests/interfaces/default_qubit_2_integration/test_torch_default_qubit_2.py +++ b/tests/interfaces/default_qubit_2_integration/test_torch_default_qubit_2.py @@ -15,10 +15,13 @@ import 
numpy as np import pytest +from param_shift_dev import ParamShiftDerivativesDevice + import pennylane as qml from pennylane.devices import DefaultQubit from pennylane.gradients import param_shift from pennylane.interfaces import execute +from pennylane.measurements import Shots torch = pytest.importorskip("torch") @@ -128,11 +131,35 @@ def cost_cache(x): # add tests for lightning 2 when possible # set rng for device when possible test_matrix = [ - ({"gradient_fn": param_shift}, 100000, DefaultQubit(seed=42)), - ({"gradient_fn": param_shift}, None, DefaultQubit()), - ({"gradient_fn": "backprop"}, None, DefaultQubit()), - ({"gradient_fn": "adjoint", "grad_on_execution": True}, None, DefaultQubit()), - ({"gradient_fn": "adjoint", "grad_on_execution": False}, None, DefaultQubit()), + ({"gradient_fn": param_shift}, Shots(100000), DefaultQubit(seed=42)), + ({"gradient_fn": param_shift}, Shots((100000, 100000)), DefaultQubit(seed=42)), + ({"gradient_fn": param_shift}, Shots(None), DefaultQubit()), + ({"gradient_fn": "backprop"}, Shots(None), DefaultQubit()), + ( + {"gradient_fn": "adjoint", "grad_on_execution": True, "device_vjp": False}, + Shots(None), + DefaultQubit(), + ), + ( + { + "gradient_fn": "adjoint", + "grad_on_execution": False, + "device_vjp": False, + }, + Shots(None), + DefaultQubit(), + ), + ({"gradient_fn": "adjoint", "device_vjp": True}, Shots(None), DefaultQubit()), + ( + {"gradient_fn": "device", "device_vjp": False}, + Shots((100000, 100000)), + ParamShiftDerivativesDevice(), + ), + ( + {"gradient_fn": "device", "device_vjp": True}, + Shots((100000, 100000)), + ParamShiftDerivativesDevice(), + ), ] @@ -171,11 +198,17 @@ def cost(a, b): assert device.tracker.totals["executions"] == 2 # different wires so different hashes assert len(res) == 2 - assert res[0].shape == () - assert res[1].shape == () - - assert qml.math.allclose(res[0], torch.cos(a) * torch.cos(b), atol=atol_for_shots(shots)) - assert qml.math.allclose(res[1], torch.cos(a) * torch.cos(b), atol=atol_for_shots(shots)) + if not shots.has_partitioned_shots: + assert res[0].shape == () + assert res[1].shape == () + exp = torch.cos(a) * torch.cos(b) + if shots.has_partitioned_shots: + for shot in range(2): + for wire in range(2): + assert qml.math.allclose(res[shot][wire], exp, atol=atol_for_shots(shots)) + else: + for wire in range(2): + assert qml.math.allclose(res[wire], exp, atol=atol_for_shots(shots)) def test_scalar_jacobian(self, execute_kwargs, shots, device): """Test scalar jacobian calculation""" @@ -186,7 +219,8 @@ def cost(a): return execute([tape], device, **execute_kwargs)[0] res = torch.autograd.functional.jacobian(cost, a) - assert res.shape == () # pylint: disable=no-member + if not shots.has_partitioned_shots: + assert res.shape == () # pylint: disable=no-member # compare to standard tape jacobian tape = qml.tape.QuantumScript([qml.RY(a, wires=0)], [qml.expval(qml.PauliZ(0))]) @@ -195,8 +229,13 @@ def cost(a): expected = fn(device.execute(tapes)) assert expected.shape == () - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) - assert torch.allclose(res, -torch.sin(a), atol=atol_for_shots(shots)) + if shots.has_partitioned_shots: + for i in range(shots.num_copies): + assert torch.allclose(res[i], expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res[i], -torch.sin(a), atol=atol_for_shots(shots)) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res, -torch.sin(a), atol=atol_for_shots(shots)) def 
test_jacobian(self, execute_kwargs, shots, device): """Test jacobian calculation""" @@ -207,23 +246,40 @@ def cost(a, b): ops = [qml.RY(a, wires=0), qml.RX(b, wires=1), qml.CNOT(wires=[0, 1])] m = [qml.expval(qml.PauliZ(0)), qml.expval(qml.PauliY(1))] tape = qml.tape.QuantumScript(ops, m, shots=shots) - return torch.hstack(execute([tape], device, **execute_kwargs)[0]) + [res] = execute([tape], device, **execute_kwargs) + if shots.has_partitioned_shots: + return torch.hstack(res[0] + res[1]) + return torch.hstack(res) res = cost(a, b) expected = torch.tensor([torch.cos(a), -torch.cos(a) * torch.sin(b)]) - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + assert torch.allclose(res[:2], expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res[2:], expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) res = torch.autograd.functional.jacobian(cost, (a, b)) assert isinstance(res, tuple) and len(res) == 2 - assert res[0].shape == (2,) - assert res[1].shape == (2,) expected = ( torch.tensor([-torch.sin(a), torch.sin(a) * torch.sin(b)]), torch.tensor([0, -torch.cos(a) * torch.cos(b)]), ) - for _r, _e in zip(res, expected): - assert torch.allclose(_r, _e, atol=atol_for_shots(shots)) + if shots.has_partitioned_shots: + assert res[0].shape == (4,) + assert res[1].shape == (4,) + + for _r, _e in zip(res, expected): + assert torch.allclose(_r[:2], _e, atol=atol_for_shots(shots)) + assert torch.allclose(_r[2:], _e, atol=atol_for_shots(shots)) + + else: + assert res[0].shape == (2,) + assert res[1].shape == (2,) + + for _r, _e in zip(res, expected): + assert torch.allclose(_r, _e, atol=atol_for_shots(shots)) def test_tape_no_parameters(self, execute_kwargs, shots, device): """Test that a tape with no parameters is correctly @@ -255,7 +311,10 @@ def cost(params): shots=shots, ) res = execute([tape1, tape2, tape3, tape4], device, **execute_kwargs) - res = [qml.math.asarray(r, like="torch") for r in res] + if shots.has_partitioned_shots: + res = [qml.math.asarray(ri, like="torch") for r in res for ri in r] + else: + res = [qml.math.asarray(r, like="torch") for r in res] return sum(torch.hstack(res)) params = torch.tensor([0.1, 0.2], requires_grad=True) @@ -263,11 +322,18 @@ def cost(params): res = cost(params) expected = 2 + np.cos(0.5) + np.cos(x) * np.cos(y) - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + + if shots.has_partitioned_shots: + assert torch.allclose(res, 2 * expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) res.backward() expected = torch.tensor([-torch.cos(y) * torch.sin(x), -torch.cos(x) * torch.sin(y)]) - assert torch.allclose(params.grad, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + assert torch.allclose(params.grad, 2 * expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(params.grad, expected, atol=atol_for_shots(shots), rtol=0) @pytest.mark.skip("torch cannot reuse tensors in various computations") def test_tapes_with_different_return_size(self, execute_kwargs, shots, device): @@ -380,8 +446,9 @@ def cost(a, b, c): # Only two arguments are trainable assert isinstance(res, tuple) and len(res) == 2 - assert res[0].shape == () - assert res[1].shape == () + if not shots.has_partitioned_shots: + assert res[0].shape == () + assert res[1].shape == () # I tried getting analytic 
results for this circuit but I kept being wrong and am giving up @@ -696,7 +763,10 @@ def _cost_fn(weights, coeffs1, coeffs2): qml.expval(H2) tape = qml.tape.QuantumScript.from_queue(q, shots=shots) - return torch.hstack(execute([tape], device, **execute_kwargs)[0]) + res = execute([tape], device, **execute_kwargs)[0] + if shots.has_partitioned_shots: + return torch.hstack(res[0] + res[1]) + return torch.hstack(res) return _cost_fn @@ -747,11 +817,19 @@ def test_multiple_hamiltonians_not_trainable( res = cost_fn(weights, coeffs1, coeffs2) expected = self.cost_fn_expected(weights, coeffs1, coeffs2) - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + assert torch.allclose(res[:2], expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res[2:], expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) res = torch.autograd.functional.jacobian(lambda w: cost_fn(w, coeffs1, coeffs2), weights) expected = self.cost_fn_jacobian(weights, coeffs1, coeffs2)[:, :2] - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + assert torch.allclose(res[:2, :], expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res[2:, :], expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) def test_multiple_hamiltonians_trainable(self, execute_kwargs, cost_fn, shots, use_new_op_math): """Test hamiltonian with trainable parameters.""" @@ -766,8 +844,17 @@ def test_multiple_hamiltonians_trainable(self, execute_kwargs, cost_fn, shots, u res = cost_fn(weights, coeffs1, coeffs2) expected = self.cost_fn_expected(weights, coeffs1, coeffs2) - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + assert torch.allclose(res[:2], expected, atol=atol_for_shots(shots), rtol=0) + assert torch.allclose(res[2:], expected, atol=atol_for_shots(shots), rtol=0) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) res = torch.hstack(torch.autograd.functional.jacobian(cost_fn, (weights, coeffs1, coeffs2))) expected = self.cost_fn_jacobian(weights, coeffs1, coeffs2) - assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) + if shots.has_partitioned_shots: + pytest.xfail( + "multiple hamiltonians with shot vectors does not seem to be differentiable." 
+ ) + else: + assert torch.allclose(res, expected, atol=atol_for_shots(shots), rtol=0) diff --git a/tests/interfaces/default_qubit_2_integration/test_torch_qnode_default_qubit_2.py b/tests/interfaces/default_qubit_2_integration/test_torch_qnode_default_qubit_2.py index 9250186f58b..7a18a6e7bc7 100644 --- a/tests/interfaces/default_qubit_2_integration/test_torch_qnode_default_qubit_2.py +++ b/tests/interfaces/default_qubit_2_integration/test_torch_qnode_default_qubit_2.py @@ -1330,7 +1330,7 @@ def circuit(): def test_counts_expval(self): """Test counts works as expected if combined with expectation values""" - @qnode(DefaultQubit(), diff_method="parameter-shift", interface="torch") + @qnode(qml.device("default.qubit"), diff_method="parameter-shift", interface="torch") def circuit(): qml.Hadamard(wires=[0]) qml.CNOT(wires=[0, 1]) diff --git a/tests/interfaces/test_torch.py b/tests/interfaces/test_torch.py index 2270182307d..6e5053246af 100644 --- a/tests/interfaces/test_torch.py +++ b/tests/interfaces/test_torch.py @@ -332,11 +332,10 @@ def cost(a, cache): interface="torch", )[0] - # Without caching, 3 evaluations are required. - # 1 for the forward pass, and one per output dimension - # on the backward pass. + # Without caching, 2 evaluations are required. + # 1 for the forward pass, and one for the backward pass torch_functional.jacobian(lambda x: cost(x, cache=None), params) - assert dev.num_executions == 3 + assert dev.num_executions == 2 # With caching, only 2 evaluations are required. One # for the forward pass, and one for the backward pass. diff --git a/tests/interfaces/test_torch_qnode.py b/tests/interfaces/test_torch_qnode.py index 4218a6fc26b..3e6977d5cb0 100644 --- a/tests/interfaces/test_torch_qnode.py +++ b/tests/interfaces/test_torch_qnode.py @@ -1487,6 +1487,8 @@ def test_hamiltonian_expansion_finite_shots( elif diff_method == "hadamard": pytest.skip("The hadamard method does not yet support Hamiltonians") + np.random.seed(1235) + dev = qml.device(dev_name, wires=3, shots=50000) spy = mocker.spy(qml.transforms, "hamiltonian_expand") obs = [qml.PauliX(0), qml.PauliX(0) @ qml.PauliZ(1), qml.PauliZ(0) @ qml.PauliZ(1)]
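
Reviewer note on the changelog entry above (``device_vjp`` with the Torch interface): a minimal usage sketch, mirroring the combinations exercised by the new test matrix in ``test_torch_default_qubit_2.py``; the exact keyword arguments shown follow that file and are assumed to match ``qml.execute``'s current signature.

.. code-block:: python

    import torch
    import pennylane as qml

    dev = qml.device("default.qubit")

    x = torch.tensor(0.1, requires_grad=True)
    tape = qml.tape.QuantumScript([qml.RX(x, 0)], [qml.expval(qml.PauliZ(0))])

    # ask the device for the vector-Jacobian product directly on the backward pass
    (res,) = qml.execute([tape], dev, gradient_fn="adjoint", device_vjp=True)
    res.backward()
    print(x.grad)  # approximately -sin(0.1)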
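Reviewer note on the new ``ExecuteTapes`` calling convention: the first positional argument to ``forward`` is a plain (non-differentiable) dict, so ``backward`` returns ``None`` for it followed by one entry per flattened trainable parameter. Below is a self-contained, PennyLane-free sketch of that pattern; ``DictFirstFunction`` and its contents are toy names for illustration only.

.. code-block:: python

    import torch

    class DictFirstFunction(torch.autograd.Function):
        # Toy version of the ExecuteTapes calling convention: the first argument is a
        # plain dict of configuration, the remaining arguments are the trainable tensors.

        @staticmethod
        def forward(ctx, kwargs, *parameters):
            ctx.scale = kwargs["scale"]
            ctx.save_for_backward(*parameters)
            return tuple(ctx.scale * p**2 for p in parameters)

        @staticmethod
        def backward(ctx, *dy):
            params = ctx.saved_tensors
            grads = tuple(d * 2 * ctx.scale * p for d, p in zip(dy, params))
            # one None for the non-differentiable kwargs dict, then one gradient per parameter
            return (None,) + grads

    x = torch.tensor(1.5, requires_grad=True)
    y = torch.tensor(2.0, requires_grad=True)
    out1, out2 = DictFirstFunction.apply({"scale": 3.0}, x, y)
    (out1 + out2).backward()
    print(x.grad, y.grad)  # tensor(9.) tensor(12.)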
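Reviewer note on the updated test matrix: raw shot integers are replaced by ``qml.measurements.Shots``, and the assertions branch on shot vectors through the attributes used in the tests above. For reference:

>>> from pennylane.measurements import Shots
>>> Shots(100000).has_partitioned_shots
False
>>> shot_vector = Shots((100000, 100000))
>>> shot_vector.has_partitioned_shots
True
>>> shot_vector.num_copies
2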