diff --git a/pennylane/gradients/gradient_transform.py b/pennylane/gradients/gradient_transform.py
index bf01f4fc83b..ca47e76aa25 100644
--- a/pennylane/gradients/gradient_transform.py
+++ b/pennylane/gradients/gradient_transform.py
@@ -295,20 +295,22 @@ def _no_trainable_grad(tape):
     return [], lambda _: tuple(qml.math.zeros([0]) for _ in range(len(tape.measurements)))
 
 
-def _swap_first_two_axes(grads, first_axis_size, second_axis_size):
+def _swap_first_two_axes(grads, first_axis_size, second_axis_size, squeeze=True):
     """Transpose the first two axes of an iterable of iterables, returning
-    a tuple of tuples."""
-    if first_axis_size == 1:
+    a tuple of tuples. Tuple version of ``np.moveaxis(grads, 0, 1)``"""
+    if first_axis_size == 1 and squeeze:
         return tuple(grads[0][i] for i in range(second_axis_size))
     return tuple(
         tuple(grads[j][i] for j in range(first_axis_size)) for i in range(second_axis_size)
     )
 
 
-def _move_first_axis_to_third_pos(grads, first_axis_size, second_axis_size, third_axis_size):
+def _move_first_axis_to_third_pos(
+    grads, first_axis_size, second_axis_size, third_axis_size, squeeze=True
+):
-    """Transpose the first two axes of an iterable of iterables, returning
-    a tuple of tuples."""
-    if first_axis_size == 1:
+    """Move the first axis of a nested iterable to the third position, returning
+    nested tuples. Tuple version of ``np.moveaxis(grads, 0, 2)``"""
+    if first_axis_size == 1 and squeeze:
         return tuple(
             tuple(grads[0][i][j] for j in range(third_axis_size)) for i in range(second_axis_size)
         )
diff --git a/pennylane/gradients/parameter_shift.py b/pennylane/gradients/parameter_shift.py
index 9036378f712..b9f558c2e81 100644
--- a/pennylane/gradients/parameter_shift.py
+++ b/pennylane/gradients/parameter_shift.py
@@ -24,7 +24,6 @@
 
 import pennylane as qml
 from pennylane import transform
-from pennylane.gradients.gradient_transform import _contract_qjac_with_cjac
 from pennylane.measurements import VarianceMP
 
 from .finite_difference import finite_diff
@@ -36,7 +35,10 @@
 )
 from .gradient_transform import (
     _all_zero_grad,
+    _contract_qjac_with_cjac,
+    _move_first_axis_to_third_pos,
     _no_trainable_grad,
+    _swap_first_two_axes,
     assert_multimeasure_not_broadcasted,
     assert_no_state_returns,
     assert_no_trainable_tape_batching,
@@ -152,7 +154,7 @@ def _single_meas_grad(result, coeffs, unshifted_coeff, r0):
 
     If an unshifted term exists, its contribution is added to the gradient.
     """
-    if isinstance(result, list) and result == []:
+    if isinstance(result, tuple) and result == ():
         if unshifted_coeff is None:
             raise ValueError(
                 "This gradient component neither has a shifted nor an unshifted component. "
@@ -173,19 +175,14 @@ def _single_meas_grad(result, coeffs, unshifted_coeff, r0):
 def _multi_meas_grad(res, coeffs, r0, unshifted_coeff, num_measurements):
     """Compute the gradient for multiple measurements by taking the linear combination of
     the coefficients and each measurement result."""
-    g = []
     if r0 is None:
         r0 = [None] * num_measurements
-    for meas_idx in range(num_measurements):
-        # Gather the measurement results
-        meas_result = [param_result[meas_idx] for param_result in res]
-        g_component = _single_meas_grad(meas_result, coeffs, unshifted_coeff, r0[meas_idx])
-        g.append(g_component)
+    if res == ():
+        res = tuple(() for _ in range(num_measurements))
+    return tuple(_single_meas_grad(r, coeffs, unshifted_coeff, r0_) for r, r0_ in zip(res, r0))
 
-    return tuple(g)
 
-
-def _evaluate_gradient(tape, res, data, r0):
+def _evaluate_gradient(tape_specs, res, data, r0, batch_size):
     """Use shifted tape evaluations and parameter-shift rule coefficients to evaluate
-    a gradient result. If res is an empty list, ``r0`` and ``data[3]``, which is the
+    a gradient result. If res is an empty tuple, ``r0`` and ``data[3]``, which is the
     coefficient for the unshifted term, must be given and not None.
@@ -197,42 +194,46 @@ def _evaluate_gradient(tape, res, data, r0):
     if fn is not None:
         res = fn(res)
 
-    num_measurements = len(tape.measurements)
+    *_, num_measurements, shots = tape_specs
+    scalar_shots, len_shot_vec = not shots.has_partitioned_shots, shots.num_copies
+
+    if r0 is None and not scalar_shots:
+        r0 = [None] * int(len_shot_vec)
 
     if num_measurements == 1:
-        if not tape.shots.has_partitioned_shots:
+        if scalar_shots:
+            # Res has axes (parameters,)
             return _single_meas_grad(res, coeffs, unshifted_coeff, r0)
-        g = []
-        len_shot_vec = tape.shots.num_copies
-        # Res has order of axes:
-        # 1. Number of parameters
-        # 2. Shot vector
-        if r0 is None:
-            r0 = [None] * int(len_shot_vec)
-        for i in range(len_shot_vec):
-            shot_comp_res = [r[i] for r in res]
-            shot_comp_res = _single_meas_grad(shot_comp_res, coeffs, unshifted_coeff, r0[i])
-            g.append(shot_comp_res)
-        return tuple(g)
-
-    g = []
-    if not tape.shots.has_partitioned_shots:
+        # Res has axes (parameters, shots) or with broadcasting (shots, parameters)
+        if batch_size is None:
+            # Move shots to first position
+            res = _swap_first_two_axes(res, len(res), len_shot_vec, squeeze=False)
+        # _single_meas_grad expects axis (parameters,), iterate over shot vector
+        return tuple(_single_meas_grad(r, coeffs, unshifted_coeff, r0_) for r, r0_ in zip(res, r0))
+
+    if scalar_shots:
+        # Res has axes (parameters, measurements) or with broadcasting (measurements, parameters)
+        if batch_size is None and len(res) > 0:
+            # Move measurements to first position
+            res = _swap_first_two_axes(res, len(res), num_measurements, squeeze=False)
+        # _multi_meas_grad expects axes (measurements, parameters)
         return _multi_meas_grad(res, coeffs, r0, unshifted_coeff, num_measurements)
 
-    # Res has order of axes:
-    # 1. Number of parameters
-    # 2. Shot vector
-    # 3. Number of measurements
-    for idx_shot_comp in range(tape.shots.num_copies):
-        single_shot_component_result = [
-            result_for_each_param[idx_shot_comp] for result_for_each_param in res
-        ]
-        multi_meas_grad = _multi_meas_grad(
-            single_shot_component_result, coeffs, r0, unshifted_coeff, num_measurements
-        )
-        g.append(multi_meas_grad)
-
-    return tuple(g)
+    # Res has axes (parameters, shots, measurements)
+    # or with broadcasting (shots, measurements, parameters)
+    if batch_size is None:
+        if len(res) > 0:
+            # Move first axis (parameters) to last position
+            res = _move_first_axis_to_third_pos(
+                res, len(res), len_shot_vec, num_measurements, squeeze=False
+            )
+        else:
+            res = (() for _ in range(len_shot_vec))
+    # _multi_meas_grad expects (measurements, parameters), so we iterate over shot vector
+    return tuple(
+        _multi_meas_grad(r, coeffs, r0_, unshifted_coeff, num_measurements)
+        for r, r0_ in zip(res, r0)
+    )
 
 
 def _get_operation_recipe(tape, t_idx, shifts, order=1):
@@ -424,14 +425,14 @@ def processing_fn(results):
                     grads.append(None)
                     continue
                 # The gradient for this parameter is computed from r0 alone.
-                g = _evaluate_gradient(tape, [], data, r0)
+                g = _evaluate_gradient(tape_specs, (), data, r0, batch_size)
                 grads.append(g)
                 continue
 
             res = results[start : start + num_tapes] if batch_size is None else results[start]
             start = start + num_tapes
 
-            g = _evaluate_gradient(tape, res, data, r0)
+            g = _evaluate_gradient(tape_specs, res, data, r0, batch_size)
             grads.append(g)
 
         # g will have been defined at least once (because otherwise all gradients would have
diff --git a/tests/gradients/parameter_shift/test_parameter_shift.py b/tests/gradients/parameter_shift/test_parameter_shift.py
index 4a33f631ba0..6823a0a3af7 100644
--- a/tests/gradients/parameter_shift/test_parameter_shift.py
+++ b/tests/gradients/parameter_shift/test_parameter_shift.py
@@ -20,12 +20,221 @@
 from pennylane.devices import DefaultQubitLegacy
 from pennylane.gradients import param_shift
 from pennylane.gradients.parameter_shift import (
+    _evaluate_gradient,
     _get_operation_recipe,
     _make_zero_rep,
     _put_zeros_in_pdA2_involutory,
 )
+from pennylane.measurements.shots import Shots
 from pennylane.operation import AnyWires, Observable
 
+# Constants for TestEvaluateGradient
+# Coefficients and expectation values
+X = np.arange(1, 5)
+# Expected "shift rule" result
+Z = np.sum(-np.arange(1, 5) ** 2)
+# Single coefficient/expectation value that leads to the same result as X
+w = np.sqrt(30)
+# Prefactors to emulate a shot vector
+shv = np.array([0.1, 0.4, 0.7])
+# Fake probability vector (just a 1d array)
+p = np.array([0.01, 0.06, -0.2, 0.5, -0.1, 0.7, -0.09])
+# Second fake probability vector (just a 1d array)
+p2 = p[1:5]
+# shifted probability evaluations
+P = np.outer(X, p)
+# shifted probability evaluations for p2
+P2 = np.outer(X, p2)
+# Single unshifted result that leads to the same result as P
+v = w * p
+# Single unshifted result that leads to the same result as P2
+v2 = w * p2
+# Prefactors to emulate different shot values and multi measurement
+shv_m = np.outer([0.1, 0.4, 0.7], [1, 2])
+
+
+class TestEvaluateGradient:
+    """Test _evaluate_gradient."""
+
+    # pylint: disable=too-many-arguments
+
+    # We could theoretically compute the required res, r0 and expected from the parametrization of coeffs,
+    # unshifted_coeff and batch_size, but that turned out to take lots of effort and edge case logic
+
+    test_cases_single_shots_single_meas = [
+        # Expectation value
+        (X, None, None, tuple(-X), None, Z),
+        (X, None, 4, -X, None, Z),
+        (X[:-1], X[-1], None, tuple(-X[:-1]), -X[-1], Z),
+        (X[:-1], X[-1], 4, -X[:-1], -X[-1], Z),
+        (np.ones(0), w, None, (), -w, Z),
+        (np.ones(0), w, 4, (), -w, Z),
+        # Probability
+        (X, None, None, tuple(-P), None, p * Z),
+        (X, None, 4, -P, None, p * Z),
+        (X[:-1], X[-1], None, tuple(-P[:-1]), -P[-1], p * Z),
+        (X[:-1], X[-1], 4, -P[:-1], -P[-1], p * Z),
+        (np.ones(0), w, None, (), -v, p * Z),
+        (np.ones(0), w, 4, (), -v, p * Z),
+    ]
+
+    @pytest.mark.parametrize(
+        "coeffs, unshifted_coeff, batch_size, res, r0, expected",
+        test_cases_single_shots_single_meas,
+    )
+    def test_single_shots_single_meas(self, coeffs, unshifted_coeff, batch_size, res, r0, expected):
+        """Test that a single shots, single measurement gradient is evaluated correctly."""
+
+        shots = Shots(100)
+        tape_specs = (None, None, 1, shots)
+        data = [None, coeffs, None, unshifted_coeff, None]
+        grad = _evaluate_gradient(tape_specs, res, data, r0, batch_size)
+
+        assert isinstance(grad, np.ndarray)
+        assert grad.shape == expected.shape
+        assert np.allclose(grad, expected)
+
+    exp_probs = (p2 * Z, 2 * p * Z)
+    test_cases_single_shots_multi_meas = [
+        # Expectation values
+        (X, None, None, tuple(zip(-X, -2 * X)), None, (Z, 2 * Z)),
+        (X, None, 4, (-X, -2 * X), None, (Z, 2 * Z)),
+        (X[:-1], X[-1], None, tuple(zip(-X[:-1], -2 * X[:-1])), (-X[-1], -2 * X[-1]), (Z, 2 * Z)),
+        (X[:-1], X[-1], 4, (-X[:-1], -2 * X[:-1]), (-X[-1], -2 * X[-1]), (Z, 2 * Z)),
+        (np.ones(0), w, None, (), (-w, -2 * w), (Z, 2 * Z)),
+        (np.ones(0), w, 4, (), (-w, -2 * w), (Z, 2 * Z)),
+        # Expval and Probability
+        (X, None, None, tuple(zip(-X, -2 * P)), None, (Z, 2 * p * Z)),
+        (X, None, 4, (-X, -2 * P), None, (Z, 2 * p * Z)),
+        (X[:-1], X[-1], None, tuple(zip(-X, -2 * P))[:-1], (-X[-1], -2 * P[-1]), (Z, 2 * p * Z)),
+        (X[:-1], X[-1], 4, (-X[:-1], -2 * P[:-1]), (-X[-1], -2 * P[-1]), (Z, 2 * p * Z)),
+        (np.ones(0), w, None, (), (-w, -2 * v), (Z, 2 * p * Z)),
+        (np.ones(0), w, 4, (), (-w, -2 * v), (Z, 2 * p * Z)),
+        # Probabilities
+        (X, None, None, tuple(zip(-P2, -2 * P)), None, exp_probs),
+        (X, None, 4, (-P2, -2 * P), None, exp_probs),
+        (X[:-1], X[-1], None, tuple(zip(-P2, -2 * P))[:-1], (-P2[-1], -2 * P[-1]), exp_probs),
+        (X[:-1], X[-1], 4, (-P2[:-1], -2 * P[:-1]), (-P2[-1], -2 * P[-1]), exp_probs),
+        (np.ones(0), w, None, (), (-v2, -2 * v), exp_probs),
+        (np.ones(0), w, 4, (), (-v2, -2 * v), exp_probs),
+    ]
+
+    @pytest.mark.parametrize(
+        "coeffs, unshifted_coeff, batch_size, res, r0, expected",
+        test_cases_single_shots_multi_meas,
+    )
+    def test_single_shots_multi_meas(self, coeffs, unshifted_coeff, batch_size, res, r0, expected):
+        """Test that a single shots, multiple measurements gradient is evaluated correctly."""
+
+        shots = Shots(100)
+        tape_specs = (None, None, 2, shots)
+        data = [None, coeffs, None, unshifted_coeff, None]
+        grad = _evaluate_gradient(tape_specs, res, data, r0, batch_size)
+
+        assert isinstance(grad, tuple) and len(grad) == 2
+        for g, e in zip(grad, expected):
+            assert isinstance(g, np.ndarray) and g.shape == e.shape
+            assert np.allclose(g, e)
+
+    shot_vec_X = tuple(zip(*(-c * X for c in shv)))
+    shot_vec_P = tuple(zip(*(-c * P for c in shv)))
+    shot_vec_P_partial = tuple(-c * P[:-1] for c in shv)
+
+    exp_shot_vec_prob = np.outer(shv, p) * Z
+    test_cases_multi_shots_single_meas = [
+        # Expectation value
+        (X, None, None, shot_vec_X, None, shv * Z),
+        (X, None, 4, tuple(-c * X for c in shv), None, shv * Z),
+        (X[:-1], X[-1], None, shot_vec_X[:-1], shot_vec_X[-1], shv * Z),
+        (X[:-1], X[-1], 4, tuple(-c * X[:-1] for c in shv), tuple(-shv * X[-1]), shv * Z),
+        (np.ones(0), w, None, (), tuple(-c * w for c in shv), shv * Z),
+        (np.ones(0), w, 4, ((), (), ()), tuple(-c * w for c in shv), shv * Z),
+        # Probability
+        (X, None, None, shot_vec_P, None, exp_shot_vec_prob),
+        (X, None, 4, tuple(-c * P for c in shv), None, exp_shot_vec_prob),
+        (X[:-1], X[-1], None, shot_vec_P[:-1], shot_vec_P[-1], exp_shot_vec_prob),
+        (X[:-1], X[-1], 4, shot_vec_P_partial, tuple(np.outer(-shv, P[-1])), exp_shot_vec_prob),
+        (np.ones(0), w, None, (), tuple(-c * v for c in shv), exp_shot_vec_prob),
+        (np.ones(0), w, 4, ((), (), ()), tuple(-c * v for c in shv), exp_shot_vec_prob),
+    ]
+
+    @pytest.mark.parametrize(
+        "coeffs, unshifted_coeff, batch_size, res, r0, expected",
+        test_cases_multi_shots_single_meas,
+    )
+    def test_multi_shots_single_meas(self, coeffs, unshifted_coeff, batch_size, res, r0, expected):
+        """Test that a shot vector, single measurement gradient is evaluated correctly."""
+
+        shots = Shots((100, 101, 102))
+        tape_specs = (None, None, 1, shots)
+        data = [None, coeffs, None, unshifted_coeff, None]
+        grad = _evaluate_gradient(tape_specs, res, data, r0, batch_size)
+
+        assert isinstance(grad, tuple) and len(grad) == 3
+        for g, e in zip(grad, expected):
+            assert isinstance(g, np.ndarray) and g.shape == e.shape
+            assert np.allclose(g, e)
+
+    multi_X = tuple(tuple((-c * x, -2 * c * x) for c in shv) for x in X)
+    batched_multi_X = tuple((-c * X, -2 * c * X) for c in shv)
+    partial_multi_X = tuple((-c * X[:-1], -2 * c * X[:-1]) for c in shv)
+    expvals_r0 = tuple((-c * w, -2 * c * w) for c in shv)
+
+    multi_X_P = tuple(tuple((-c * _p, -2 * c * x) for c in shv) for x, _p in zip(X, P))
+    batched_multi_X_P = tuple((-c * P, -2 * c * X) for c in shv)
+    partial_multi_X_P = tuple((-c * P[:-1], -2 * c * X[:-1]) for c in shv)
+    prob_expval_r0 = tuple((-c * v, -2 * c * w) for c in shv)
+
+    multi_P_P = tuple(tuple((-c * _p, -2 * c * _q) for c in shv) for _q, _p in zip(P2, P))
+    batched_multi_P_P = tuple((-c * P, -2 * c * P2) for c in shv)
+    partial_multi_P_P = tuple((-c * P[:-1], -2 * c * P2[:-1]) for c in shv)
+    probs_r0 = tuple((-c * v, -2 * c * v2) for c in shv)
+
+    exp_shot_vec_prob_expval = tuple((c * p * Z, 2 * c * Z) for c in shv)
+    exp_shot_vec_probs = tuple((c * p * Z, 2 * c * p2 * Z) for c in shv)
+    test_cases_multi_shots_multi_meas = [
+        # Expectation values
+        (X, None, None, multi_X, None, shv_m * Z),
+        (X, None, 4, batched_multi_X, None, shv_m * Z),
+        (X[:-1], X[-1], None, multi_X[:-1], multi_X[-1], shv_m * Z),
+        (X[:-1], X[-1], 4, partial_multi_X, multi_X[-1], shv_m * Z),
+        (np.ones(0), w, None, (), expvals_r0, shv_m * Z),
+        (np.ones(0), w, 4, ((), (), ()), expvals_r0, shv_m * Z),
+        # Probability and expectation
+        (X, None, None, multi_X_P, None, exp_shot_vec_prob_expval),
+        (X, None, 4, batched_multi_X_P, None, exp_shot_vec_prob_expval),
+        (X[:-1], X[-1], None, multi_X_P[:-1], multi_X_P[-1], exp_shot_vec_prob_expval),
+        (X[:-1], X[-1], 4, partial_multi_X_P, multi_X_P[-1], exp_shot_vec_prob_expval),
+        (np.ones(0), w, None, (), prob_expval_r0, exp_shot_vec_prob_expval),
+        (np.ones(0), w, 4, ((), (), ()), prob_expval_r0, exp_shot_vec_prob_expval),
+        # Probabilities
+        (X, None, None, multi_P_P, None, exp_shot_vec_probs),
+        (X, None, 4, batched_multi_P_P, None, exp_shot_vec_probs),
+        (X[:-1], X[-1], None, multi_P_P[:-1], multi_P_P[-1], exp_shot_vec_probs),
+        (X[:-1], X[-1], 4, partial_multi_P_P, multi_P_P[-1], exp_shot_vec_probs),
+        (np.ones(0), w, None, (), probs_r0, exp_shot_vec_probs),
+        (np.ones(0), w, 4, ((), (), ()), probs_r0, exp_shot_vec_probs),
+    ]
+
+    @pytest.mark.parametrize(
+        "coeffs, unshifted_coeff, batch_size, res, r0, expected",
+        test_cases_multi_shots_multi_meas,
+    )
+    def test_multi_shots_multi_meas(self, coeffs, unshifted_coeff, batch_size, res, r0, expected):
+        """Test that a shot vector, multiple measurements gradient is evaluated correctly."""
+
+        shots = Shots((100, 101, 102))
+        tape_specs = (None, None, 2, shots)
+        data = [None, coeffs, None, unshifted_coeff, None]
+        grad = _evaluate_gradient(tape_specs, res, data, r0, batch_size)
+
+        assert isinstance(grad, tuple) and len(grad) == 3
+        for g, e in zip(grad, expected):
+            assert isinstance(g, tuple) and len(g) == 2
+            for _g, _e in zip(g, e):
+                assert isinstance(_g, np.ndarray) and _g.shape == _e.shape
+                assert np.allclose(_g, _e)
+
 
 # pylint: disable=too-few-public-methods
 class RY_with_F(qml.RY):
diff --git a/tests/gradients/parameter_shift/test_parameter_shift_cv.py b/tests/gradients/parameter_shift/test_parameter_shift_cv.py
index 0642511267c..8064bb51891 100644
--- a/tests/gradients/parameter_shift/test_parameter_shift_cv.py
+++ b/tests/gradients/parameter_shift/test_parameter_shift_cv.py
@@ -14,7 +14,7 @@
 """Tests for the gradients.parameter_shift_cv module."""
 # pylint: disable=protected-access, no-self-use, not-callable, no-value-for-parameter
 
-import unittest.mock as mock
+from unittest import mock
 
 import pytest