Commit
Initial commit
multiphaseCFD committed Sep 11, 2024
1 parent ffcb279 commit 7de991f
Showing 5 changed files with 135 additions and 68 deletions.
@@ -406,69 +406,36 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
      */
     void get_state_tensor(ComplexT *host_data,
                           const int32_t numHyperSamples = 1) {
-        std::vector<std::size_t> wires(BaseType::getNumQubits());
-        std::iota(wires.begin(), wires.end(), 0);
+        std::vector<int32_t> projected_modes{};
+        std::vector<int64_t> projectedModeValues{};

-        const std::size_t length = std::size_t{1} << wires.size();
+        const std::size_t length = std::size_t{1} << BaseType::getNumQubits();

         DataBuffer<CFP_t, int> d_output_tensor(length, getDevTag(), true);

-        get_state_tensor(d_output_tensor.getData(), d_output_tensor.getLength(),
-                         wires, numHyperSamples);
+        get_accessor_(d_output_tensor.getData(), length, projected_modes,
+                      projectedModeValues, numHyperSamples);

         d_output_tensor.CopyGpuDataToHost(host_data, length);
     }

     /**
-     * @brief Get a slice of the full state tensor
+     * @brief Get a slice of the full state tensor.
      *
      * @param tensor_data Pointer to the device memory for state tensor data.
      * @param tensor_data_size Size of the state tensor data.
-     * @param wires Wires to get the state tensor for.
+     * @param projected_modes Projected modes to get the state tensor for.
+     * @param projectedModeValues Values of the projected modes.
      * @param numHyperSamples Number of hyper samples to use in the calculation
      * and is set to 1 by default.
      */
     void get_state_tensor(CFP_t *tensor_data,
                           const std::size_t tensor_data_size,
-                          const std::vector<std::size_t> &wires,
+                          const std::vector<int32_t> &projected_modes,
+                          const std::vector<int64_t> &projectedModeValues,
                           const int32_t numHyperSamples = 1) const {
-        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
-            wires, BaseType::getNumQubits());
-
-        std::vector<int32_t> projected_modes{};
-
-        for (int32_t idx = 0;
-             idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
-            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
-            if (it == stateModes.end()) {
-                projected_modes.emplace_back(idx);
-            }
-        }
-
-        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
-
-        if (projected_modes.empty()) {
-            get_accessor_(tensor_data, tensor_data_size, projected_modes,
-                          projectedModeValues, numHyperSamples);
-        } else {
-            DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);
-
-            const std::size_t projected_modes_size = std::size_t(1)
-                                                     << projected_modes.size();
-            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
-                for (std::size_t j = 0; j < projected_modes.size(); j++) {
-                    projectedModeValues[j] = (idx >> j) & 1;
-                }
-
-                get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
-                              projectedModeValues, numHyperSamples);
-                // Copy the data to the output tensor
-                scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
-                                  tmp.getData(), tensor_data, tmp.getLength(),
-                                  getDevTag().getDeviceID(),
-                                  getDevTag().getStreamID(), getCublasCaller());
-            }
-        }
+        get_accessor_(tensor_data, tensor_data_size, projected_modes,
+                      projectedModeValues, numHyperSamples);
     }

 private:
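The slicing loop deleted above does not disappear: it moves into MeasurementsTNCuda::probs in the next file. The pattern is worth seeing in isolation: every mode outside the measured wires becomes a "projected" mode, and each bitstring assigned to those modes selects one slice of the state tensor. Below is a minimal host-side sketch of that enumeration, with illustrative names (num_qubits, wires) standing in for the class members used in the real code:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const int32_t num_qubits = 4;            // illustrative system size
    const std::vector<int32_t> wires{0, 2};  // modes we keep (measure)

    // Every mode not in `wires` is projected, mirroring the std::find loop
    // this commit moves into MeasurementsTNCuda::probs.
    std::vector<int32_t> projected_modes;
    for (int32_t idx = 0; idx < num_qubits; idx++) {
        if (std::find(wires.begin(), wires.end(), idx) == wires.end()) {
            projected_modes.emplace_back(idx);
        }
    }

    // Enumerate all 2^k settings of the projected modes, exactly as the
    // `(idx >> j) & 1` loop does before each get_accessor_ call.
    std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
    const std::size_t n_settings = std::size_t{1} << projected_modes.size();
    for (std::size_t idx = 0; idx < n_settings; idx++) {
        for (std::size_t j = 0; j < projected_modes.size(); j++) {
            projectedModeValues[j] = (idx >> j) & 1;
        }
        std::cout << "setting " << idx << ":";
        for (auto v : projectedModeValues) {
            std::cout << ' ' << v;
        }
        std::cout << '\n';
    }
}

Enumerating settings this way visits the 2^k slices in lexicographic order of the projected-mode bits, which is why the accumulation below can simply add each slice's contribution.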
@@ -106,24 +106,72 @@ template <class TensorNetT> class MeasurementsTNCuda {
         DataBuffer<CFP_t, int> d_output_tensor(
             length, tensor_network_.getDevTag(), true);

+        DataBuffer<PrecisionT, int> d_output_probs(
+            length, tensor_network_.getDevTag(), true);
+
         d_output_tensor.zeroInit();
+        d_output_probs.zeroInit();

-        tensor_network_.get_state_tensor(d_output_tensor.getData(),
-                                         d_output_tensor.getLength(), wires,
-                                         numHyperSamples);
+        auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
+            wires, tensor_network_.getNumQubits());

-        // `10` here means `1024` elements to be calculated
-        // LCOV_EXCL_START
-        if (wires.size() > 10) {
-            DataBuffer<PrecisionT, int> d_output_probs(
-                length, tensor_network_.getDevTag(), true);
+        std::vector<int32_t> projected_modes{};
+
+        for (int32_t idx = 0;
+             idx < static_cast<int32_t>(tensor_network_.getNumQubits());
+             idx++) {
+            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
+            if (it == stateModes.end()) {
+                projected_modes.emplace_back(idx);
+            }
+        }
+
+        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
+
+        if (projected_modes.size() == 0) {
+            tensor_network_.get_state_tensor(d_output_tensor.getData(),
+                                             d_output_tensor.getLength(), {},
+                                             {}, numHyperSamples);
             getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
                           length, static_cast<int>(thread_per_block),
                           tensor_network_.getDevTag().getStreamID());
+        } else {
+            PL_ABORT_IF(projected_modes.size() > 63,
+                        "Number of projected modes is greater than 63.");
+            const std::size_t projected_modes_size = std::size_t(1)
+                                                     << projected_modes.size();
+
+            DataBuffer<PrecisionT, int> tmp_probs(
+                length, tensor_network_.getDevTag(), true);
+
+            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
+                for (std::size_t j = 0; j < projected_modes.size(); j++) {
+                    projectedModeValues[j] = (idx >> j) & 1;
+                }
+
+                tensor_network_.get_state_tensor(
+                    d_output_tensor.getData(), length, projected_modes,
+                    projectedModeValues, numHyperSamples);
+
+                getProbs_CUDA(d_output_tensor.getData(), tmp_probs.getData(),
+                              length, static_cast<int>(thread_per_block),
+                              tensor_network_.getDevTag().getStreamID());
+
+                // Accumulate this slice's probabilities into the output buffer
+                scaleAndAdd_CUDA(PrecisionT{1.0}, tmp_probs.getData(),
+                                 d_output_probs.getData(),
+                                 tmp_probs.getLength(),
+                                 tensor_network_.getDevTag().getDeviceID(),
+                                 tensor_network_.getDevTag().getStreamID(),
+                                 tensor_network_.getCublasCaller());
+            }
+        }

-            PrecisionT sum;
+        // `10` here means `1024` elements to be calculated
+        // LCOV_EXCL_START
+        if (wires.size() > 10) {
+            PrecisionT sum;
             asum_CUDA_device<PrecisionT>(
                 d_output_probs.getData(), length,
                 tensor_network_.getDevTag().getDeviceID(),
@@ -137,30 +185,20 @@ template <class TensorNetT> class MeasurementsTNCuda {
                 tensor_network_.getDevTag().getStreamID());

             d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());
-
         } else {
             // LCOV_EXCL_STOP
-            // This branch dispatches the calculation to the CPU for a small
-            // number of wires. The CPU calculation is faster than the GPU
-            // calculation for a small number of wires due to the overhead of
-            // the GPU kernel launch.
-            std::vector<ComplexT> h_state_vector(length);
-            d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
-                                              h_state_vector.size());
-            // TODO: OMP support
-            for (std::size_t i = 0; i < length; i++) {
-                h_res[i] = std::norm(h_state_vector[i]);
-            }
+            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());

             // TODO: OMP support
-            PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
+            PrecisionT sum =
+                std::accumulate(h_res.begin(), h_res.end(), PrecisionT{0.0});

             PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
             // TODO: OMP support
             for (std::size_t i = 0; i < length; i++) {
                 h_res[i] /= sum;
             }
         }

         return h_res;
     }
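The accumulation above computes a marginal distribution: each projected-mode setting clamps the unmeasured wires to one bitstring, getProbs_CUDA squares the resulting slice amplitudes, and scaleAndAdd_CUDA sums the slices, so the output is the sum of |amplitude|^2 over the traced-out wires. A CPU-only sketch of the same accumulation on a plain two-qubit state vector (illustrative values, no CUDA assumed):

#include <complex>
#include <iostream>
#include <vector>

int main() {
    // 2-qubit state |psi> = (|00> + |01> + |10> + |11>) / 2.
    // Index convention: bit 0 = kept (measured) wire, bit 1 = projected wire.
    const std::vector<std::complex<double>> psi{
        {0.5, 0.0}, {0.5, 0.0}, {0.5, 0.0}, {0.5, 0.0}};

    std::vector<double> marginal(2, 0.0); // probabilities over the kept wire
    for (int projected = 0; projected <= 1; projected++) { // 2^1 settings
        for (int kept = 0; kept <= 1; kept++) {
            const auto amp = psi[(projected << 1) | kept];  // one slice entry
            marginal[kept] += std::norm(amp);               // accumulate |amp|^2
        }
    }

    std::cout << "P(0) = " << marginal[0] << ", P(1) = " << marginal[1] << '\n';
    // Prints P(0) = 0.5, P(1) = 0.5 — the marginal over the projected qubit.
}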
@@ -91,4 +91,16 @@ TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
         auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
         REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
     }
+
+    SECTION("Test excessive projected wires failure") {
+        // Define the tensor network state to be measured.
+        std::size_t bondDim = GENERATE(2, 3, 4, 5);
+        std::size_t num_qubits = 100;
+        std::size_t maxBondDim = bondDim;
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+        REQUIRE_THROWS_AS(measure.probs({0, 1, 2, 3}), LightningException);
+    }
 }
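The new section drives probs into the PL_ABORT_IF guard added above: with 100 qubits and 4 measured wires, 96 modes are projected, so the accumulation loop would need 2^96 accessor calls — and std::size_t(1) << 96 is not even representable in the 64-bit counter, hence the limit of 63. A sketch of that arithmetic, using the values from the test:

#include <cstddef>
#include <iostream>

int main() {
    const std::size_t num_qubits = 100; // TensorNetT mps_state{100, bondDim}
    const std::size_t n_measured = 4;   // measure.probs({0, 1, 2, 3})
    const std::size_t projected = num_qubits - n_measured; // 96 projected modes

    // std::size_t(1) << projected overflows for projected >= 64, so the
    // implementation aborts once more than 63 modes must be enumerated.
    std::cout << "projected modes: " << projected << ", guard fires: "
              << std::boolalpha << (projected > 63) << '\n';
}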
28 changes: 28 additions & 0 deletions pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -215,6 +215,34 @@ inline auto scaleAndAddC_CUDA(const CFP_t a, const T *v1, T *v2,
     }
 }

+/**
+ * @brief cuBLAS backed GPU SAXPY/DAXPY.
+ *
+ * @tparam T Float data-type. Accepts float and double.
+ * @param a Scaling factor.
+ * @param v1 Device data pointer 1 (source data, scaled but not modified).
+ * @param v2 Device data pointer 2 (result data, modified in place).
+ * @param data_size Length of the device data.
+ * @param dev_id The device on which the function should be executed.
+ * @param stream_id The CUDA stream on which the operation should be executed.
+ * @param cublas The CublasCaller object that manages the cuBLAS handle.
+ */
+
+template <class T = double, class DevTypeID = int>
+inline auto scaleAndAdd_CUDA(const T a, const T *v1, T *v2, const int data_size,
+                             DevTypeID dev_id, cudaStream_t stream_id,
+                             const CublasCaller &cublas) {
+    if constexpr (std::is_same_v<T, float>) {
+        const float alpha = a;
+        cublas.call(cublasSaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    } else if constexpr (std::is_same_v<T, double>) {
+        const double alpha = a;
+        cublas.call(cublasDaxpy, dev_id, stream_id, data_size, &alpha, v1, 1,
+                    v2, 1);
+    }
+}
+
 /**
  * @brief cuBLAS backed GPU data scaling.
  *
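scaleAndAdd_CUDA is a thin wrapper over cuBLAS AXPY, v2 = a * v1 + v2. A standalone sketch of the same call without the CublasCaller plumbing, assuming only a CUDA toolkit with cuBLAS; buffer contents are illustrative:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

int main() {
    const int n = 4;
    std::vector<double> h_x{1, 2, 3, 4};     // source (v1)
    std::vector<double> h_y{10, 20, 30, 40}; // accumulator (v2)

    double *d_x = nullptr, *d_y = nullptr;
    cudaMalloc(&d_x, n * sizeof(double));
    cudaMalloc(&d_y, n * sizeof(double));
    cudaMemcpy(d_x, h_x.data(), n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y.data(), n * sizeof(double), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const double alpha = 1.0; // the probs loop accumulates with a = 1
    cublasDaxpy(handle, n, &alpha, d_x, 1, d_y, 1); // d_y = alpha * d_x + d_y

    cudaMemcpy(h_y.data(), d_y, n * sizeof(double), cudaMemcpyDeviceToHost);
    for (double v : h_y) {
        std::cout << v << ' '; // prints: 11 22 33 44
    }
    std::cout << '\n';

    cublasDestroy(handle);
    cudaFree(d_x);
    cudaFree(d_y);
}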
22 changes: 22 additions & 0 deletions tests/lightning_tensor/test_measurements_class.py
@@ -77,3 +77,25 @@ def test_not_measure_tensor_network(self, lightning_tn):

         with pytest.raises(NotImplementedError):
             m.measure_tensor_network(tape)
+
+    @pytest.mark.parametrize("n_qubits", range(4, 12, 4))
+    @pytest.mark.parametrize("n_targets", list(range(1, 4)) + list(range(4, 12, 4)))
+    def test_probs_many_wires(self, n_qubits, n_targets, tol):
+        """Test probs measuring many wires of a random quantum state."""
+        if n_targets >= n_qubits:
+            pytest.skip("Number of targets cannot exceed the number of wires.")
+
+        dev = qml.device(device_name, wires=n_qubits)
+        dq = qml.device("default.qubit", wires=n_qubits)
+
+        init_state = np.random.rand(2**n_qubits) + 1.0j * np.random.rand(2**n_qubits)
+        init_state /= np.linalg.norm(init_state)
+
+        def circuit():
+            qml.StatePrep(init_state, wires=range(n_qubits))
+            return qml.probs(wires=range(0, n_targets))
+
+        res = qml.QNode(circuit, dev)()
+        ref = qml.QNode(circuit, dq)()
+
+        assert np.allclose(res, ref, atol=tol, rtol=0)
